1 // =============================================================================
2 // CD-HI/CD-HIT
3 //
4 // Cluster Database at High Identity
5 //
6 // CD-HIT clusters protein sequences at high identity threshold.
7 // This program can remove the high sequence redundance efficiently.
8 //
9 // program written by
10 // Weizhong Li
11 // UCSD, San Diego Supercomputer Center
12 // La Jolla, CA, 92093
13 // Email liwz@sdsc.edu
14 //
15 // at
16 // Adam Godzik's lab
17 // The Burnham Institute
18 // La Jolla, CA, 92037
19 // Email adam@burnham-inst.org
20 //
21 // modified by:
22 // Limin Fu
23 // Center for Research in Biological Systems (CRBS), UCSD
24 // La Jolla, CA, 92093
25 // Email: l2fu@ucsd.edu, fu@daovm.net
26 // =============================================================================
27
28 #include "cdhit-common.h"
29 #include<valarray>
30 #include<stdint.h>
31 #include<assert.h>
32 #include<limits.h>
33
34 #ifndef NO_OPENMP
35
36 #include<omp.h>
37
38 #define WITH_OPENMP " (+OpenMP)"
39
40 #else
41
42 #define WITH_OPENMP ""
43
44 #define omp_set_num_threads(T) (T = T)
45 #define omp_get_thread_num() 0
46
47 #endif
48
49 //class function definition
50 const char aa[] = {"ARNDCQEGHILKMFPSTWYVBZX"};
51 //{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,2,6,20};
52 int aa2idx[] = {0, 2, 4, 3, 6, 13,7, 8, 9,20,11,10,12, 2,20,14,
53 5, 1,15,16,20,19,17,20,18, 6};
54 // idx for A B C D E F G H I J K L M N O P
55 // Q R S T U V W X Y Z
56 // so aa2idx[ X - 'A'] => idx_of_X, eg aa2idx['A' - 'A'] => 0,
57 // and aa2idx['M'-'A'] => 12
58
59 int BLOSUM62[] = {
60 4, // A
61 -1, 5, // R
62 -2, 0, 6, // N
63 -2,-2, 1, 6, // D
64 0,-3,-3,-3, 9, // C
65 -1, 1, 0, 0,-3, 5, // Q
66 -1, 0, 0, 2,-4, 2, 5, // E
67 0,-2, 0,-1,-3,-2,-2, 6, // G
68 -2, 0, 1,-1,-3, 0, 0,-2, 8, // H
69 -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, // I
70 -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4, // L
71 -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5, // K
72 -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, // M
73 -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6, // F
74 -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7, // P
75 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, // S
76 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5, // T
77 -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, // W
78 -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7, // Y
79 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4, // V
80 -2,-1, 3, 4,-3, 0, 1,-1, 0,-3,-4, 0,-3,-3,-2, 0,-1,-4,-3,-3, 4, // B
81 -1, 0, 0, 1,-3, 3, 4,-2, 0,-3,-3, 1,-1,-3,-1, 0,-1,-3,-2,-2, 1, 4, // Z
82 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-1,-1,-1 // X
83 //A R N D C Q E G H I L K M F P S T W Y V B Z X
84 //0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 2 6 20
85 };
86
87
88 int na2idx[] = {0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4,
89 4, 4, 4, 3, 3, 4, 4, 4, 4, 4};
90 // idx for A B C D E F G H I J K L M N O P
91 // Q R S T U V W X Y Z
92 // so aa2idx[ X - 'A'] => idx_of_X, eg aa2idx['A' - 'A'] => 0,
93 // and aa2idx['M'-'A'] => 4
94 int BLOSUM62_na[] = {
95 2, // A
96 -2, 2, // C
97 -2,-2, 2, // G
98 -2,-2,-2, 2, // T
99 -2,-2,-2, 1, 2, // U
100 -2,-2,-2,-2,-2, 1, // N
101 0, 0, 0, 0, 0, 0, 1 // X
102 //A C G T U N X
103 //0 1 2 3 3 4 5
104 };
105
106 void setaa_to_na();
107
108 struct TempFile
109 {
110 FILE *file;
111 char buf[512];
112
TempFileTempFile113 TempFile( const char *dir = NULL ){
114 int len = dir ? strlen( dir ) : 0;
115 assert( len < 400 );
116 buf[0] = 0;
117 if( len ){
118 strcat( buf, dir );
119 if( buf[len-1] != '/' && buf[len-1] != '\\' ){
120 buf[len] = '/';
121 len += 1;
122 }
123 }
124 strcat( buf, "cdhit.temp." );
125 len += 11;
126 sprintf( buf + len, "%p", this );
127 file = fopen( buf, "w+" );
128 }
~TempFileTempFile129 ~TempFile(){
130 if( file ){
131 fclose( file );
132 remove( buf );
133 }
134 }
135 };
136
137 struct TempFiles
138 {
139 NVector<TempFile*> files;
140
~TempFilesTempFiles141 ~TempFiles(){ Clear(); }
142
ClearTempFiles143 void Clear(){
144 int i;
145 #pragma omp critical
146 {
147 for(i=0; i<files.size; i++) if( files[i] ) delete files[i];
148 files.Clear();
149 }
150 }
151 };
152
153 const char *temp_dir = "";
154 TempFiles temp_files;
155
OpenTempFile(const char * dir=NULL)156 FILE* OpenTempFile( const char *dir = NULL )
157 {
158 TempFile *file = new TempFile( dir );
159 #pragma omp critical
160 {
161 temp_files.files.Append( file );
162 }
163 return file->file;
164 }
CleanUpTempFiles()165 static void CleanUpTempFiles()
166 {
167 if( temp_files.files.Size() ) printf( "Clean up temporary files ...\n" );
168 temp_files.Clear();
169 }
170
171 int NAA1 ;
172 int NAA2 ;
173 int NAA3 ;
174 int NAA4 ;
175 int NAA5 ;
176 int NAA6 ;
177 int NAA7 ;
178 int NAA8 ;
179 int NAA9 ;
180 int NAA10;
181 int NAA11;
182 int NAA12;
183 int NAAN_array[13] = { 1 };
184
InitNAA(int max)185 void InitNAA( int max )
186 {
187 NAA1 = NAAN_array[1] = max;
188 NAA2 = NAAN_array[2] = NAA1 * NAA1;
189 NAA3 = NAAN_array[3] = NAA1 * NAA2;
190 NAA4 = NAAN_array[4] = NAA2 * NAA2;
191 NAA5 = NAAN_array[5] = NAA2 * NAA3;
192 NAA6 = NAAN_array[6] = NAA3 * NAA3;
193 NAA7 = NAAN_array[7] = NAA3 * NAA4;
194 NAA8 = NAAN_array[8] = NAA4 * NAA4;
195 NAA9 = NAAN_array[9] = NAA4 * NAA5;
196 NAA10 = NAAN_array[10] = NAA5 * NAA5;
197 NAA11 = NAAN_array[11] = NAA5 * NAA6;
198 NAA12 = NAAN_array[12] = NAA6 * NAA6;
199 }
200
201 extern Options options;
202 ScoreMatrix mat;
203 Vector<int> Comp_AAN_idx;
204
make_comp_iseq(int len,char * iseq_comp,char * iseq)205 void make_comp_iseq(int len, char *iseq_comp, char *iseq)
206 {
207 int i, c[6] = {3,2,1,0,4,5};
208 for (i=0; i<len; i++) iseq_comp[i] = c[ (int)iseq[len-i-1] ];
209 } // make_comp_iseq
210
SetOptionCommon(const char * flag,const char * value)211 bool Options::SetOptionCommon( const char *flag, const char *value )
212 {
213 int intval = atoi( value );
214 if (strcmp(flag, "-i" ) == 0) input = value;
215 else if (strcmp(flag, "-j" ) == 0) input_pe = value;
216 else if (strcmp(flag, "-o" ) == 0) output = value;
217 else if (strcmp(flag, "-op") == 0) output_pe = value;
218 else if (strcmp(flag, "-M" ) == 0) max_memory = atoll(value) * 1000000;
219 else if (strcmp(flag, "-l" ) == 0) min_length = intval;
220 else if (strcmp(flag, "-c" ) == 0) cluster_thd = atof(value), useIdentity = true;
221 else if (strcmp(flag, "-D" ) == 0) distance_thd = atof(value), useDistance = true;
222 else if (strcmp(flag, "-b" ) == 0) band_width = intval;
223 else if (strcmp(flag, "-n" ) == 0) NAA = intval;
224 else if (strcmp(flag, "-d" ) == 0) des_len = intval;
225 else if (strcmp(flag, "-s" ) == 0) diff_cutoff = atof(value);
226 else if (strcmp(flag, "-S" ) == 0) diff_cutoff_aa = intval;
227 else if (strcmp(flag, "-B" ) == 0) store_disk = intval;
228 else if (strcmp(flag, "-P" ) == 0) PE_mode = intval;
229 else if (strcmp(flag, "-cx") == 0) trim_len = intval;
230 else if (strcmp(flag, "-cy") == 0) trim_len_R2 = intval;
231 else if (strcmp(flag, "-ap") == 0) align_pos = intval;
232 else if (strcmp(flag, "-sc") == 0) sort_output = intval;
233 else if (strcmp(flag, "-sf") == 0) sort_outputf = intval;
234 else if (strcmp(flag, "-p" ) == 0) print = intval;
235 else if (strcmp(flag, "-g" ) == 0) cluster_best = intval;
236 else if (strcmp(flag, "-G" ) == 0) global_identity = intval;
237 else if (strcmp(flag, "-aL") == 0) long_coverage = atof(value);
238 else if (strcmp(flag, "-AL") == 0) long_control = intval;
239 else if (strcmp(flag, "-aS") == 0) short_coverage = atof(value);
240 else if (strcmp(flag, "-AS") == 0) short_control = intval;
241 else if (strcmp(flag, "-A" ) == 0) min_control = intval;
242 else if (strcmp(flag, "-uL") == 0) long_unmatch_per = atof(value);
243 else if (strcmp(flag, "-uS") == 0) short_unmatch_per = atof(value);
244 else if (strcmp(flag, "-U") == 0) unmatch_len = intval;
245 else if (strcmp(flag, "-tmp" ) == 0) temp_dir = value;
246 else if (strcmp(flag, "-bak" ) == 0) backupFile = intval;
247 else if (strcmp(flag, "-T" ) == 0){
248 #ifndef NO_OPENMP
249 int cpu = omp_get_num_procs();
250 threads = intval;
251 if( threads > cpu ){
252 threads = cpu;
253 printf( "Warning: total number of CPUs in the system is %i\n", cpu );
254 }else if( threads < 0 ){
255 threads += cpu;
256 if( threads < 0 ) threads = 0;
257 }
258 if( threads == 0 ){
259 threads = cpu;
260 printf( "total number of CPUs in the system is %i\n", cpu );
261 }
262 if( threads != intval ) printf( "Actual number of CPUs to be used: %i\n\n", threads );
263 #else
264 printf( "Option -T is ignored: multi-threading with OpenMP is NOT enabled!\n" );
265 #endif
266 }else return false;
267 return true;
268 }
SetOption(const char * flag,const char * value)269 bool Options::SetOption( const char *flag, const char *value )
270 {
271 if( is454 ){
272 if( strcmp(flag, "-s") == 0 ) return false;
273 else if( strcmp(flag, "-S") == 0 ) return false;
274 else if( strcmp(flag, "-G") == 0 ) return false;
275 else if( strcmp(flag, "-A") == 0 ) return false;
276 else if( strcmp(flag, "-r") == 0 ) return false;
277 else if( strcmp(flag, "-D") == 0 ){ max_indel = atoi(value); return true; }
278 }
279 if( SetOptionCommon( flag, value ) ) return true;
280 if (strcmp(flag, "-t" ) == 0) tolerance = atoi(value);
281 else if (strcmp(flag, "-F" ) == 0) frag_size = atoi(value);
282 else if (has2D && SetOption2D( flag, value ) ) return true;
283 else if (isEST && SetOptionEST( flag, value ) ) return true;
284 else return false;
285 return true;
286 }
SetOption2D(const char * flag,const char * value)287 bool Options::SetOption2D( const char *flag, const char *value )
288 {
289 if( SetOptionCommon( flag, value ) ) return true;
290 if (strcmp(flag, "-i2" ) == 0) input2 = value;
291 else if (strcmp(flag, "-j2" ) == 0) input2_pe = value;
292 else if (strcmp(flag, "-s2") == 0) diff_cutoff2 = atof(value);
293 else if (strcmp(flag, "-S2") == 0) diff_cutoff_aa2 = atoi(value);
294 else return false;
295 return true;
296 }
SetOptionEST(const char * flag,const char * value)297 bool Options::SetOptionEST( const char *flag, const char *value )
298 {
299 NAA_top_limit = 12;
300 if( SetOptionCommon( flag, value ) ) return true;
301 if (strcmp(flag, "-r" ) == 0) option_r = atoi(value);
302 else if (strcmp(flag, "-gap") == 0) mat.gap = MAX_SEQ * atoi(value);
303 else if (strcmp(flag, "-gap-ext") == 0) mat.ext_gap = MAX_SEQ * atoi(value);
304 else if (strcmp(flag, "-match") == 0) mat.set_match( atoi(value) );
305 else if (strcmp(flag, "-mismatch") == 0) mat.set_mismatch( atoi(value) );
306 else if (strcmp(flag, "-mask") == 0){
307 string letters = value;
308 int i, n = letters.size();
309 for(i=0; i<n; i++){
310 char ch = toupper( letters[i] );
311 if( ch < 'A' || ch > 'Z' ) continue;
312 na2idx[ ch - 'A' ] = 5;
313 }
314 setaa_to_na();
315 }else return false;
316 return true;
317 }
SetOptions(int argc,char * argv[],bool twod,bool est)318 bool Options::SetOptions( int argc, char *argv[], bool twod, bool est )
319 {
320 int i, n;
321 char date[100];
322 strcpy( date, __DATE__ );
323 n = strlen( date );
324 for(i=1; i<n; i++) if( date[i-1] == ' ' and date[i] == ' ' ) date[i] = '0';
325 printf( "================================================================\n" );
326 printf( "Program: CD-HIT, V" CDHIT_VERSION WITH_OPENMP ", %s, " __TIME__ "\n", date );
327 printf( "Command:" );
328 n = 9;
329 for(i=0; i<argc; i++){
330 n += strlen( argv[i] ) + 1;
331 if( n >= 64 ){
332 printf( "\n %s", argv[i] );
333 n = strlen( argv[i] ) + 9;
334 }else{
335 printf( " %s", argv[i] );
336 }
337 }
338 printf( "\n\n" );
339 time_t tm = time(NULL);
340 printf( "Started: %s", ctime(&tm) );
341 printf( "================================================================\n" );
342 printf( " Output \n" );
343 printf( "----------------------------------------------------------------\n" );
344 has2D = twod;
345 isEST = est;
346 for (i=1; i+1<argc; i+=2) if ( SetOption( argv[i], argv[i+1] ) == 0) return false;
347 if( i < argc ) return false;
348
349 atexit( CleanUpTempFiles );
350 return true;
351 }
Validate()352 void Options::Validate()
353 {
354 if( useIdentity and useDistance ) bomb_error( "can not use both identity cutoff and distance cutoff" );
355 if( useDistance ){
356 if ((distance_thd > 1.0) || (distance_thd < 0.0)) bomb_error("invalid distance threshold");
357 }else if( isEST ){
358 if ((cluster_thd > 1.0) || (cluster_thd < 0.8)) bomb_error("invalid clstr threshold, should >=0.8");
359 }else{
360 if ((cluster_thd > 1.0) || (cluster_thd < 0.4)) bomb_error("invalid clstr");
361 }
362
363 if (input.size() == 0) bomb_error("no input file");
364 if (output.size() == 0) bomb_error("no output file");
365 if (PE_mode) {
366 if (input_pe.size() == 0) bomb_error("no input file for R2 sequences in PE mode");
367 if (output_pe.size() == 0) bomb_error("no output file for R2 sequences in PE mode");
368 }
369 if (isEST && (align_pos==1)) option_r = 0;
370
371 if (band_width < 1 ) bomb_error("invalid band width");
372 if (NAA < 2 || NAA > NAA_top_limit) bomb_error("invalid word length");
373 if (des_len < 0 ) bomb_error("too short description, not enough to identify sequences");
374 if (not isEST && (tolerance < 0 || tolerance > 5) ) bomb_error("invalid tolerance");
375 if ((diff_cutoff<0) || (diff_cutoff>1)) bomb_error("invalid value for -s");
376 if (diff_cutoff_aa<0) bomb_error("invalid value for -S");
377 if( has2D ){
378 if ((diff_cutoff2<0) || (diff_cutoff2>1)) bomb_error("invalid value for -s2");
379 if (diff_cutoff_aa2<0) bomb_error("invalid value for -S2");
380 if (PE_mode) {
381 if (input2_pe.size() == 0) bomb_error("no input file for R2 sequences for 2nd db in PE mode");
382 }
383 }
384 if (global_identity == 0) print = 1;
385 if (short_coverage < long_coverage) short_coverage = long_coverage;
386 if (short_control > long_control) short_control = long_control;
387 if ((global_identity == 0) && (short_coverage == 0.0) && (min_control == 0))
388 bomb_error("You are using local identity, but no -aS -aL -A option");
389 if (frag_size < 0) bomb_error("invalid fragment size");
390
391 #if 0
392 if( useDistance ){
393 /* when required_aan becomes zero */
394 if( distance_thd * NAA >= 1 )
395 bomb_warning( "word length is too long for the distance cutoff" );
396 }else{
397 /* when required_aan becomes zero */
398 if( cluster_thd <= 1.0 - 1.0 / NAA )
399 bomb_warning( "word length is too long for the identity cutoff" );
400 }
401 #endif
402
403 const char *message = "Your word length is %i, using %i may be faster!\n";
404 if ( not isEST && tolerance ) {
405 int i, clstr_idx = (int) (cluster_thd * 100) - naa_stat_start_percent;
406 int tcutoff = naa_stat[tolerance-1][clstr_idx][5-NAA];
407
408 if (tcutoff < 5 )
409 bomb_error("Too low cluster threshold for the word length.\n"
410 "Increase the threshold or the tolerance, or decrease the word length.");
411 for ( i=5; i>NAA; i--) {
412 if ( naa_stat[tolerance-1][clstr_idx][5-i] > 10 ) {
413 printf( message, NAA, i );
414 break;
415 }
416 }
417 } else if( isEST ) {
418 if ( cluster_thd > 0.9 && NAA < 8 ) printf( message, NAA, 8 );
419 else if ( cluster_thd > 0.87 && NAA < 5 ) printf( message, NAA, 5 );
420 else if ( cluster_thd > 0.80 && NAA < 4 ) printf( message, NAA, 4 );
421 else if ( cluster_thd > 0.75 && NAA < 3 ) printf( message, NAA, 3 );
422 } else {
423 if ( cluster_thd > 0.85 && NAA < 5 ) printf( message, NAA, 5 );
424 else if ( cluster_thd > 0.80 && NAA < 4 ) printf( message, NAA, 4 );
425 else if ( cluster_thd > 0.75 && NAA < 3 ) printf( message, NAA, 3 );
426 }
427
428 if ( (min_length + 1) < NAA ) bomb_error("Too short -l, redefine it");
429 }
Print()430 void Options::Print()
431 {
432 printf( "isEST = %i\n", isEST );
433 printf( "has2D = %i\n", has2D );
434 printf( "NAA = %i\n", NAA );
435 printf( "NAA_top_limit = %i\n", NAA_top_limit );
436 printf( "min_length = %i\n", min_length );
437 printf( "cluster_best = %i\n", cluster_best );
438 printf( "global_identity = %i\n", global_identity );
439 printf( "cluster_thd = %g\n", cluster_thd );
440 printf( "diff_cutoff = %g\n", diff_cutoff );
441 printf( "diff_cutoff_aa = %i\n", diff_cutoff_aa );
442 printf( "tolerance = %i\n", tolerance );
443 printf( "long_coverage = %g\n", long_coverage );
444 printf( "long_control = %i\n", long_control );
445 printf( "short_coverage = %g\n", short_coverage );
446 printf( "short_control = %i\n", short_control );
447 printf( "frag_size = %i\n", frag_size );
448 printf( "option_r = %i\n", option_r );
449 printf( "print = %i\n", print );
450 }
451
bomb_error(const char * message)452 void bomb_error(const char *message)
453 {
454 fprintf( stderr, "\nFatal Error:\n%s\nProgram halted !!\n\n", message );
455 temp_files.Clear();
456 exit (1);
457 } // END void bomb_error
458
bomb_error(const char * message,const char * message2)459 void bomb_error(const char *message, const char *message2)
460 {
461 fprintf( stderr, "\nFatal Error:\n%s %s\nProgram halted !!\n\n", message, message2 );
462 temp_files.Clear();
463 exit (1);
464 } // END void bomb_error
465
466
bomb_warning(const char * message)467 void bomb_warning(const char *message)
468 {
469 fprintf( stderr, "\nWarning:\n%s\nNot fatal, but may affect results !!\n\n", message );
470 } // END void bomb_warning
471
472
bomb_warning(const char * message,const char * message2)473 void bomb_warning(const char *message, const char *message2)
474 {
475 fprintf( stderr, "\nWarning:\n%s %s\nNot fatal, but may affect results !!\n\n", message, message2 );
476 } // END void bomb_warning
477
format_seq(char * seq)478 void format_seq(char *seq)
479 {
480 int i, j;
481 char c1;
482 int len = strlen(seq);
483
484 for (i=0,j=0; i<len; i++) {
485 c1 = toupper(seq[i]);
486 if ( isalpha(c1) ) seq[j++] = c1;
487 }
488 seq[j] = 0;
489 } // END void format_seq
490
strrev(char * p)491 void strrev(char *p)
492 {
493 char *q = p;
494 while(q && *q) ++q;
495 for(--q; p < q; ++p, --q)
496 *p = *p ^ *q,
497 *q = *p ^ *q,
498 *p = *p ^ *q;
499 }
500
501 ////For smiple len1 <= len2, len2 is for existing representative
502 ////walk along all diag path of two sequences,
503 ////find the diags with most aap
504 ////return top n diags
505 ////added on 2006 11 13
506 ////band 0 XXXXXXXXXXXXXXXXXX seq2, rep seq
507 //// XXXXXXXXXXXXXXX seq1
508 ////band 1 XXXXXXXXXXXXXXXXXX seq2, rep seq
509 //// XXXXXXXXXXXXXXX seq1
510 ////extreme right (+) XXXXXXXXXXXXXXXXXX seq2, rep seq
511 //// band = len2-1 XXXXXXXXXXXXXXX seq1
512 ////band-1 XXXXXXXXXXXXXXXXXX seq2, rep seq
513 //// XXXXXXXXXXXXXXX seq1
514 ////extreme left (-) XXXXXXXXXXXXXXXXXX seq2, rep seq
515 //// XXXXXXXXXXXXXXX band = -(len1-1) seq1
516 ////index of diag_score = band+len1-1;
diag_test_aapn(int NAA1,char iseq2[],int len1,int len2,WorkingBuffer & buffer,int & best_sum,int band_width,int & band_left,int & band_center,int & band_right,int required_aa1)517 int diag_test_aapn(int NAA1, char iseq2[], int len1, int len2, WorkingBuffer & buffer,
518 int &best_sum, int band_width, int &band_left, int &band_center, int &band_right, int required_aa1)
519 {
520 int i, i1, j, k;
521 int *pp;
522 int nall = len1+len2-1;
523 Vector<int> & taap = buffer.taap;
524 Vector<INTs> & aap_begin = buffer.aap_begin;
525 Vector<INTs> & aap_list = buffer.aap_list;
526 Vector<int> & diag_score = buffer.diag_score;
527 Vector<int> & diag_score2 = buffer.diag_score2;
528
529 if (nall > MAX_DIAG) bomb_error("in diag_test_aapn, MAX_DIAG reached");
530 for (pp=&diag_score[0], i=nall; i; i--, pp++) *pp=0;
531 for (pp=&diag_score2[0], i=nall; i; i--, pp++) *pp=0;
532
533 int c22, cpx;
534 INTs *bip;
535 int len11 = len1-1;
536 int len22 = len2-1;
537 i1 = len11;
538 for (i=0; i<len22; i++,i1++) {
539 c22 = iseq2[i]*NAA1+ iseq2[i+1];
540 cpx = 1 + (iseq2[i] != iseq2[i+1]);
541 if ( (j=taap[c22]) == 0) continue;
542 int m = aap_begin[c22];
543 for(int k=0; k<j; k++){
544 diag_score[ i1 - aap_list[m+k] ] ++;
545 diag_score2[ i1 - aap_list[m+k] ] += cpx;
546 }
547 }
548
549 //find the best band range
550 // int band_b = required_aa1;
551 int band_b = required_aa1-1 >= 0 ? required_aa1-1:0; // on dec 21 2001
552 int band_e = nall - band_b;
553
554 int band_m = ( band_b+band_width-1 < band_e ) ? band_b+band_width-1 : band_e;
555 int best_score=0;
556 int best_score2=0;
557 int max_diag = 0;
558 int max_diag2 = 0;
559 int imax_diag = 0;
560 for (i=band_b; i<=band_m; i++){
561 best_score += diag_score[i];
562 best_score2 += diag_score2[i];
563 if( diag_score2[i] > max_diag2 ){
564 max_diag2 = diag_score2[i];
565 max_diag = diag_score[i];
566 imax_diag = i;
567 }
568 }
569 int from=band_b;
570 int end =band_m;
571 int score = best_score;
572 int score2 = best_score2;
573 for (k=from, j=band_m+1; j<band_e; j++, k++) {
574 score -= diag_score[k];
575 score += diag_score[j];
576 score2 -= diag_score2[k];
577 score2 += diag_score2[j];
578 if ( score2 > best_score2 ) {
579 from = k + 1;
580 end = j;
581 best_score = score;
582 best_score2 = score2;
583 if( diag_score2[j] > max_diag2 ){
584 max_diag2 = diag_score2[j];
585 max_diag = diag_score[j];
586 imax_diag = j;
587 }
588 }
589 }
590 int mlen = imax_diag;
591 if( imax_diag > len1 ) mlen = nall - imax_diag;
592 int emax = int((1.0 - options.cluster_thd) * mlen) + 1;
593 for (j=from; j<imax_diag; j++) { // if aap pairs fail to open gap
594 if ( (imax_diag - j) > emax || diag_score[j] < 1 ) {
595 best_score -= diag_score[j]; from++;
596 } else break;
597 }
598 for (j=end; j>imax_diag; j--) { // if aap pairs fail to open gap
599 if ( (j - imax_diag) > emax || diag_score[j] < 1 ) {
600 best_score -= diag_score[j]; end--;
601 } else break;
602 }
603
604 // delete [] diag_score;
605 band_left = from - len1 + 1;
606 band_right= end - len1 + 1;
607 band_center = imax_diag - len1 + 1;
608 best_sum = best_score;
609 return OK_FUNC;
610 }
611 // END diag_test_aapn
612
613
diag_test_aapn_est(int NAA1,char iseq2[],int len1,int len2,WorkingBuffer & buffer,int & best_sum,int band_width,int & band_left,int & band_center,int & band_right,int required_aa1)614 int diag_test_aapn_est(int NAA1, char iseq2[], int len1, int len2, WorkingBuffer & buffer,
615 int &best_sum, int band_width, int &band_left, int &band_center, int &band_right, int required_aa1)
616 {
617 int i, i1, j, k;
618 int *pp, *pp2;
619 int nall = len1+len2-1;
620 int NAA2 = NAA1 * NAA1;
621 int NAA3 = NAA2 * NAA1;
622 Vector<int> & taap = buffer.taap;
623 Vector<INTs> & aap_begin = buffer.aap_begin;
624 Vector<INTs> & aap_list = buffer.aap_list;
625 Vector<int> & diag_score = buffer.diag_score;
626 Vector<int> & diag_score2 = buffer.diag_score2;
627
628 if (nall > MAX_DIAG) bomb_error("in diag_test_aapn_est, MAX_DIAG reached");
629 pp = & diag_score[0];
630 pp2 = & diag_score2[0];
631 for (i=nall; i; i--, pp++, pp2++) *pp = *pp2 =0;
632
633 INTs *bip;
634 int c22, cpx;
635 int len22 = len2-3;
636 i1 = len1-1;
637 for (i=0; i<len22; i++,i1++,iseq2++) {
638 unsigned char c0 = iseq2[0];
639 unsigned char c1 = iseq2[1];
640 unsigned char c2 = iseq2[2];
641 unsigned char c3 = iseq2[3];
642 if ((c0>=4) || (c1>=4) || (c2>=4) || (c3>=4)) continue; //skip N
643
644 c22 = c0*NAA3+ c1*NAA2 + c2*NAA1 + c3;
645 if ( (j=taap[c22]) == 0) continue;
646 cpx = 1 + (c0 != c1) + (c1 != c2) + (c2 != c3);
647 bip = & aap_list[ aap_begin[c22] ]; // bi = aap_begin[c22];
648 for (; j; j--, bip++) {
649 diag_score[i1 - *bip]++;
650 diag_score2[i1 - *bip] += cpx;
651 }
652 }
653 #if 0
654 int mmax = 0;
655 int immax = 0;
656 for(i=0; i<=nall; i++){
657 if( i%len2 ==0 or i == nall ) printf( "\n" );
658 printf( "%3i ", diag_score[i] );
659 if( diag_score[i] > mmax ){
660 mmax = diag_score[i];
661 immax = i;
662 }
663 }
664 #endif
665
666 //find the best band range
667 // int band_b = required_aa1;
668 int band_b = required_aa1-1 >= 0 ? required_aa1-1:0; // on dec 21 2001
669 int band_e = nall - band_b;
670
671 if( options.is454 ){
672 band_b = len1 - band_width;
673 band_e = len1 + band_width;
674 if( band_b < 0 ) band_b = 0;
675 if( band_e > nall ) band_e = nall;
676 }
677
678 int band_m = ( band_b+band_width-1 < band_e ) ? band_b+band_width-1 : band_e;
679 int best_score=0;
680 int best_score2=0;
681 int max_diag = 0;
682 int max_diag2 = 0;
683 int imax_diag = 0;
684 for (i=band_b; i<=band_m; i++){
685 best_score += diag_score[i];
686 best_score2 += diag_score2[i];
687 if( diag_score2[i] > max_diag2 ){
688 max_diag2 = diag_score2[i];
689 max_diag = diag_score[i];
690 imax_diag = i;
691 }
692 }
693 int from=band_b;
694 int end =band_m;
695 int score = best_score;
696 int score2 = best_score2;
697
698 for (k=from, j=band_m+1; j<band_e; j++, k++) {
699 score -= diag_score[k];
700 score += diag_score[j];
701 score2 -= diag_score2[k];
702 score2 += diag_score2[j];
703 if ( score2 > best_score2 ) {
704 from = k + 1;
705 end = j;
706 best_score = score;
707 best_score2 = score2;
708 if( diag_score2[j] > max_diag2 ){
709 max_diag2 = diag_score2[j];
710 max_diag = diag_score[j];
711 imax_diag = j;
712 }
713 }
714 }
715 #if 0
716 printf( "%i %i\n", required_aa1, from );
717 printf( "max=%3i imax=%3i; band: %3i %3i %i\n", max_diag, imax_diag, band_b, band_e, band_m );
718 printf( "best: %i\n", best_score );
719 printf( "from: %i, end: %i, best: %i\n", from, end, best_score );
720 #endif
721 int mlen = imax_diag;
722 if( imax_diag > len1 ) mlen = nall - imax_diag;
723 int emax = int((1.0 - options.cluster_thd) * mlen) + 1;
724 for (j=from; j<imax_diag; j++) { // if aap pairs fail to open gap
725 if ( (imax_diag - j) > emax || diag_score[j] < 1 ) {
726 best_score -= diag_score[j]; from++;
727 } else break;
728 }
729 for (j=end; j>imax_diag; j--) { // if aap pairs fail to open gap
730 if ( (j - imax_diag) > emax || diag_score[j] < 1 ) {
731 best_score -= diag_score[j]; end--;
732 } else break;
733 }
734
735 band_left = from-len1+1;
736 band_right= end-len1+1;
737 band_center = imax_diag - len1 + 1;
738 best_sum = best_score;
739 if( options.is454 ){
740 if( band_left > 0 ) best_sum = 0;
741 if( band_right < 0 ) best_sum = 0;
742 }
743 #if 0
744 printf( "%3i: best: %i, %i %i %i\n", required_aa1, best_score, band_left, band_right, band_width );
745 printf( "max=%3i imax=%3i; band: %3i %3i %i\n", mmax, immax, band_b, band_e, band_m );
746 #endif
747 return OK_FUNC;
748 }
749 // END diag_test_aapn_est
750
751
752 /*
753 local alignment of two sequence within a diag band
754 for band 0 means direction (0,0) -> (1,1)
755 1 means direction (0,1) -> (1,2)
756 -1 means direction (1,0) -> (2,1)
757 added on 2006 11 13
758 band 0 XXXXXXXXXXXXXXXXXX seq2, rep seq
759 XXXXXXXXXXXXXXX seq1
760 band 1 XXXXXXXXXXXXXXXXXX seq2, rep seq
761 XXXXXXXXXXXXXXX seq1
762 extreme right (+) XXXXXXXXXXXXXXXXXX seq2, rep seq
763 band = len2-1 XXXXXXXXXXXXXXX seq1
764 band-1 XXXXXXXXXXXXXXXXXX seq2, rep seq
765 XXXXXXXXXXXXXXX seq1
766 extreme left (-) XXXXXXXXXXXXXXXXXX seq2, rep seq
767 XXXXXXXXXXXXXXX band = -(len1-1) seq1
768 iseq len are integer sequence and its length,
769 mat is matrix, return ALN_PAIR class
770
771 band: -101 seq2 len2 = 17
772 \\\1234567890123456
773 0 \xxxxxxxxxxxxxxxxx
774 1 xxxxxxxxxxxxxxxxx\ most right band = len2-1
775 2 xxxxxxxxxxxxxxxxx
776 seq1 3 xxxxxxxxxxxxxxxxx
777 len1 = 11 4 xxxxxxxxxxxxxxxxx
778 5 xxxxxxxxxxxxxxxxx
779 6 xxxxxxxxxxxxxxxxx
780 7 xxxxxxxxxxxxxxxxx
781 8 xxxxxxxxxxxxxxxxx
782 9 xxxxxxxxxxxxxxxxx
783 0 xxxxxxxxxxxxxxxxx
784 \
785 most left band = -(len1-1)
786
787 */
788
local_band_align(char iseq1[],char iseq2[],int len1,int len2,ScoreMatrix & mat,int & best_score,int & iden_no,int & alnln,float & dist,int * alninfo,int band_left,int band_center,int band_right,WorkingBuffer & buffer)789 int local_band_align( char iseq1[], char iseq2[], int len1, int len2, ScoreMatrix &mat,
790 int &best_score, int &iden_no, int &alnln, float &dist, int *alninfo,
791 int band_left, int band_center, int band_right, WorkingBuffer & buffer)
792 {
793 int i, j, k, j1;
794 int jj, kk;
795 int iden_no1;
796 int64_t best_score1;
797 iden_no = 0;
798
799 if ( (band_right >= len2 ) ||
800 (band_left <= -len1) ||
801 (band_left > band_right) ) return FAILED_FUNC;
802
803 // allocate mem for score_mat[len1][len2] etc
804 int band_width = band_right - band_left + 1;
805 int band_width1 = band_width + 1;
806
807 // score_mat, back_mat [i][j]: i index of seqi (0 to len(seqi)-1), j index of band (0 to band_width-1)
808 MatrixInt64 & score_mat = buffer.score_mat;
809 MatrixInt & back_mat = buffer.back_mat;
810
811 //printf( "%i %i\n", band_right, band_left );
812
813 if( score_mat.size() <= len1 ){
814 VectorInt row( band_width1, 0 );
815 VectorInt64 row2( band_width1, 0 );
816 while( score_mat.size() <= len1 ){
817 score_mat.Append( row2 );
818 back_mat.Append( row );
819 }
820 }
821 for(i=0; i<=len1; i++){
822 if( score_mat[i].Size() < band_width1 ) score_mat[i].Resize( band_width1 );
823 if( back_mat[i].Size() < band_width1 ) back_mat[i].Resize( band_width1 );
824 }
825
826 best_score = 0;
827 /* seq1 is query, seq2 is rep
828 seq2 len2 = 17 seq2 len2 = 17 seq2 len2 = 17
829 01234567890123456 01234567890123456 01234567890123456
830 0 xxxxxxxxxxxxxxxxx \\\\\\XXXxxxxxxxxxxxxxx xXXXXXXXxxxxxxxxx
831 1 \\\\\Xxxxxxxxxxxxxxxxx \\\\\Xxx\xxxxxxxxxxxxx xx\xxxxx\xxxxxxxx
832 2 \\\\X\xxxxxxxxxxxxxxx \\\\Xxxx\xxxxxxxxxxxx xxx\xxxxx\xxxxxxx
833 seq1 3 \\\Xx\xxxxxxxxxxxxxx \\\Xxxxx\xxxxxxxxxxx xxxx\xxxxx\xxxxxx
834 len1 4 \\Xxx\xxxxxxxxxxxxx \\Xxxxxx\xxxxxxxxxx xxxxx\xxxxx\xxxxx
835 = 11 5 \Xxxx\xxxxxxxxxxxx \Xxxxxxx\xxxxxxxxx xxxxxx\xxxxx\xxxx
836 6 Xxxxx\xxxxxxxxxxx Xxxxxxxx\xxxxxxxx xxxxxxx\xxxxx\xxx
837 7 x\xxxx\xxxxxxxxxx x\xxxxxxx\xxxxxxx xxxxxxxx\xxxxx\xx
838 8 xx\xxxx\xxxxxxxxx xx\xxxxxxx\xxxxxx xxxxxxxxx\xxxxx\x
839 9 xxx\xxxx\xxxxxxxx xxx\xxxxxxx\xxxxx xxxxxxxxxx\xxxxx\
840 0 xxxx\xxxx\xxxxxxx xxxx\xxxxxxx\xxxx xxxxxxxxxxx\xxxxx
841 band_left < 0 (-6) band_left < 0 (-6) band_left >=0 (1)
842 band_right < 0 (-1) band_right >=0 (2) band_right >=0(7)
843 band_width 6 band_width 9 band_width 7
844 init score_mat, and iden_mat (place with upper 'X')
845 */
846
847 if (band_left < 0) { //set score to left border of the matrix within band
848 int tband = (band_right < 0) ? band_right : 0;
849 //for (k=band_left; k<tband; k++)
850 for (k=band_left; k<=tband; k++) { // fixed on 2006 11 14
851 i = -k;
852 j1 = k-band_left;
853 // penalty for leading gap opening = penalty for gap extension
854 // each of the left side query hunging residues give ext_gap (-1)
855 score_mat[i][j1] = mat.ext_gap * i;
856 back_mat[i][j1] = DP_BACK_TOP;
857 }
858 back_mat[-tband][tband-band_left] = DP_BACK_NONE;
859 }
860
861 if (band_right >=0) { //set score to top border of the matrix within band
862 int tband = (band_left > 0) ? band_left : 0;
863 for (j=tband; j<=band_right; j++) {
864 j1 = j-band_left;
865 score_mat[0][j1] = mat.ext_gap * j;
866 back_mat[0][j1] = DP_BACK_LEFT;
867 }
868 back_mat[0][tband-band_left] = DP_BACK_NONE;
869 }
870
871 int gap_open[2] = { mat.gap, mat.ext_gap };
872 int max_diag = band_center - band_left;
873 int extra_score[4] = { 4, 3, 2, 1 };
874 for (i=1; i<=len1; i++) {
875 int J0 = 1 - band_left - i;
876 int J1 = len2 - band_left - i;
877 if( J0 < 0 ) J0 = 0;
878 if( J1 >= band_width ) J1 = band_width;
879 for (j1=J0; j1<=J1; j1++){
880 j = j1+i+band_left;
881
882 int ci = iseq1[i-1];
883 int cj = iseq2[j-1];
884 int sij = mat.matrix[ci][cj];
885 //int iden_ij = (ci == cj);
886 int s1, k0, back;
887
888 /* extra score according to the distance to the best diagonal */
889 int extra = extra_score[ abs(j1 - max_diag) & 3 ]; // max distance 3
890 sij += extra * (sij>0);
891
892 back = DP_BACK_LEFT_TOP;
893 best_score1 = score_mat[i-1][j1] + sij;
894 int gap0 = gap_open[ (i == len1) | (j == len2) ];
895 int gap = 0;
896 int64_t score;
897
898 if( j1 > 0 ){
899 gap = gap0;
900 if( back_mat[i][j1-1] == DP_BACK_LEFT ) gap = mat.ext_gap;
901 if( (score = score_mat[i][j1-1] + gap) > best_score1 ){
902 back = DP_BACK_LEFT;
903 best_score1 = score;
904 }
905 }
906 if(j1+1<band_width){
907 gap = gap0;
908 if( back_mat[i-1][j1+1] == DP_BACK_TOP ) gap = mat.ext_gap;
909 if( (score = score_mat[i-1][j1+1] + gap) > best_score1 ){
910 back = DP_BACK_TOP;
911 best_score1 = score;
912 }
913 }
914 score_mat[i][j1] = best_score1;
915 back_mat[i][j1] = back;
916 //printf( "%2i(%2i) ", best_score1, iden_no1 );
917
918 }
919 //printf( "\n" );
920 }
921 i = j = 0;
922 if( len2 - band_left < len1 ){
923 i = len2 - band_left;
924 j = len2;
925 }else if( len1 + band_right < len2 ){
926 i = len1;
927 j = len1 + band_right;
928 }else{
929 i = len1;
930 j = len2;
931 }
932 j1 = j - i - band_left;
933 best_score = score_mat[i][j1];
934 best_score1 = score_mat[i][j1];
935
936 #if 1
937 const char *letters = "acgtnx";
938 const char *letters2 = "ACGTNX";
939 #else
940 const char *letters = "arndcqeghilkmfpstwyvbzx";
941 const char *letters2 = "ARNDCQEGHILKMFPSTWYVBZX";
942 #endif
943 int back = back_mat[i][j1];
944 int last = back;
945 int count = 0, count2 = 0, count3 = 0;
946 int match, begin1, begin2, end1, end2;
947 int gbegin1=0, gbegin2=0, gend1=0, gend2=0;
948 int64_t score, smin = best_score1, smax = best_score1 - 1;
949 int posmin, posmax, pos = 0;
950 int bl, dlen = 0, dcount = 0;
951 posmin = posmax = 0;
952 begin1 = begin2 = end1 = end2 = 0;
953
954 #ifdef PRINT
955 #define PRINT
956 printf( "%i %i\n", best_score, score_mat[i][j1] );
957 printf( "%i %i %i\n", band_left, band_center, band_right );
958 printf( "%i %i %i %i\n", i, j, j1, len2 );
959 #endif
960 #ifdef MAKEALIGN
961 #define MAKEALIGN
962 char AA[ MAX_SEQ ], BB[ MAX_SEQ ];
963 int NN = 0;
964 int IA, IB;
965 for(IA=len1;IA>i;IA--){
966 AA[NN] = letters[ iseq1[IA-1] ];
967 BB[NN++] = '-';
968 }
969 for(IB=len2;IB>j;IB--){
970 AA[NN] = '-';
971 BB[NN++] = letters[ iseq2[IB-1] ];
972 }
973 #endif
974
975 int masked = 0;
976 int indels = 0;
977 int max_indels = 0;
978 while( back != DP_BACK_NONE ){
979 switch( back ){
980 case DP_BACK_TOP :
981 #ifdef PRINT
982 printf( "%5i: %c %c %9i\n", pos, letters[ iseq1[i-1] ], '|', score_mat[i][j1] );
983 #endif
984 #ifdef MAKEALIGN
985 AA[NN] = letters[ iseq1[i-1] ];
986 BB[NN++] = '-';
987 #endif
988 bl = (last != back) & (j != 1) & (j != len2);
989 dlen += bl;
990 dcount += bl;
991 score = score_mat[i][j1];
992 if( score < smin ){
993 count2 = 0;
994 smin = score;
995 posmin = pos - 1;
996 begin1 = i;
997 begin2 = j;
998 }
999 i -= 1;
1000 j1 += 1;
1001 break;
1002 case DP_BACK_LEFT :
1003 #ifdef PRINT
1004 printf( "%5i: %c %c %9i\n", pos, '|', letters[ iseq2[j-1] ], score_mat[i][j1] );
1005 #endif
1006 #ifdef MAKEALIGN
1007 AA[NN] = '-';
1008 BB[NN++] = letters[ iseq2[j-1] ];
1009 #endif
1010 bl = (last != back) & (i != 1) & (i != len1);
1011 dlen += bl;
1012 dcount += bl;
1013 score = score_mat[i][j1];
1014 if( score < smin ){
1015 count2 = 0;
1016 smin = score;
1017 posmin = pos - 1;
1018 begin1 = i;
1019 begin2 = j;
1020 }
1021 j1 -= 1;
1022 j -= 1;
1023 break;
1024 case DP_BACK_LEFT_TOP :
1025 #ifdef PRINT
1026 if( iseq1[i-1] == iseq2[j-1] ){
1027 printf( "%5i: %c %c %9i\n", pos, letters2[ iseq1[i-1] ], letters2[ iseq2[j-1] ], score_mat[i][j1] );
1028 }else{
1029 printf( "%5i: %c %c %9i\n", pos, letters[ iseq1[i-1] ], letters[ iseq2[j-1] ], score_mat[i][j1] );
1030 }
1031 #endif
1032 #ifdef MAKEALIGN
1033 if( iseq1[i-1] == iseq2[j-1] ){
1034 AA[NN] = letters2[ iseq1[i-1] ];
1035 BB[NN++] = letters2[ iseq2[j-1] ];
1036 }else{
1037 AA[NN] = letters[ iseq1[i-1] ];
1038 BB[NN++] = letters[ iseq2[j-1] ];
1039 }
1040 #endif
1041 if( alninfo && options.global_identity ){
1042 if( i == 1 || j == 1 ){
1043 gbegin1 = i-1;
1044 gbegin2 = j-1;
1045 }else if( i == len1 || j == len2 ){
1046 gend1 = i-1;
1047 gend2 = j-1;
1048 }
1049 }
1050 score = score_mat[i][j1];
1051 i -= 1;
1052 j -= 1;
1053 match = iseq1[i] == iseq2[j];
1054 if( score > smax ){
1055 count = 0;
1056 smax = score;
1057 posmax = pos;
1058 end1 = i;
1059 end2 = j;
1060 }
1061 if( options.isEST && (iseq1[i] > 4 || iseq2[j] > 4) ){
1062 masked += 1;
1063 }else{
1064 dlen += 1;
1065 dcount += ! match;
1066 count += match;
1067 count2 += match;
1068 count3 += match;
1069 }
1070 if( score < smin ){
1071 int mm = match == 0;
1072 count2 = 0;
1073 smin = score;
1074 posmin = pos - mm;
1075 begin1 = i + mm;
1076 begin2 = j + mm;
1077 }
1078 break;
1079 default : printf( "%i\n", back ); break;
1080 }
1081 if( options.is454 ){
1082 if( back == DP_BACK_LEFT_TOP ){
1083 if( indels > max_indels ) max_indels = indels;
1084 indels = 0;
1085 }else{
1086 if( last == DP_BACK_LEFT_TOP ){
1087 indels = 1;
1088 }else if( indels ){
1089 indels += 1;
1090 }
1091 }
1092 }
1093 pos += 1;
1094 last = back;
1095 back = back_mat[i][j1];
1096 }
1097 if( options.is454 and max_indels > options.max_indel ) return FAILED_FUNC;
1098 iden_no = options.global_identity ? count3 : count - count2;
1099 alnln = posmin - posmax + 1 - masked;
1100 dist = dcount/(float)dlen;
1101 //dist = - 0.75 * log( 1.0 - dist * 4.0 / 3.0 );
1102 int umtail1 = len1 - 1 - end1;
1103 int umtail2 = len2 - 1 - end2;
1104 int umhead = begin1 < begin2 ? begin1 : begin2;
1105 int umtail = umtail1 < umtail2 ? umtail1 : umtail2;
1106 int umlen = umhead + umtail;
1107 if( umlen > options.unmatch_len ) return FAILED_FUNC;
1108 if( umlen > len1 * options.short_unmatch_per ) return FAILED_FUNC;
1109 if( umlen > len2 * options.long_unmatch_per ) return FAILED_FUNC;
1110 if( alninfo ){
1111 alninfo[0] = begin1;
1112 alninfo[1] = end1;
1113 alninfo[2] = begin2;
1114 alninfo[3] = end2;
1115 alninfo[4] = masked;
1116 if( options.global_identity ){
1117 alninfo[0] = gbegin1;
1118 alninfo[1] = gend1;
1119 alninfo[2] = gbegin2;
1120 alninfo[3] = gend2;
1121 }
1122 }
1123 #ifdef PRINT
1124 printf( "%6i %6i: %4i %4i %4i %4i\n", alnln, iden_no, begin1, end1, begin2, end2 );
1125 printf( "%6i %6i: %4i %4i\n", posmin, posmax, posmin - posmax, count - count2 );
1126 printf( "smin = %9i, smax = %9i\n", smin, smax );
1127 printf( "dlen = %5i, dcount = %5i, dist = %.3f\n", dlen, dcount, dcount/(float)dlen );
1128 #endif
1129 #ifdef MAKEALIGN
1130 float identity = iden_no / (float)( options.global_identity ? (len1 - masked) : alnln);
1131 if( identity < options.cluster_thd ) return OK_FUNC;
1132 while(i--){
1133 AA[NN] = letters[ iseq1[i-1] ];
1134 BB[NN++] = '-';
1135 }
1136 while(j--){
1137 AA[NN] = '-';
1138 BB[NN++] = letters[ iseq2[j-1] ];
1139 }
1140 AA[NN] = '\0';
1141 BB[NN] = '\0';
1142 for(i=0; i<NN/2; i++){
1143 char aa = AA[i], bb = BB[i];
1144 AA[i] = AA[NN-i-1];
1145 BB[i] = BB[NN-i-1];
1146 AA[NN-i-1] = aa;
1147 BB[NN-i-1] = bb;
1148 }
1149 static int fcount = 0;
1150 fcount += 1;
1151 FILE *fout = fopen( "alignments.txt", "a" );
1152 if( fout == NULL ){
1153 if( fcount <= 1 ) printf( "alignment files open failed\n" );
1154 return OK_FUNC;
1155 }
1156 fprintf( fout, "\n\n######################################################\n" );
1157 fprintf( fout, "# length X = %i\n", len2 );
1158 fprintf( fout, "# length Y = %i\n", len1 );
1159 fprintf( fout, "# best align X: %i-%i\n", begin2+1, end2+1 );
1160 fprintf( fout, "# best align Y: %i-%i\n", begin1+1, end1+1 );
1161 if( alninfo ){
1162 fprintf( fout, "# align X: %i-%i\n", alninfo[2]+1, alninfo[3]+1 );
1163 fprintf( fout, "# align Y: %i-%i\n", alninfo[0]+1, alninfo[1]+1 );
1164 }
1165 fprintf( fout, "# alignment length: %i\n", alnln );
1166 fprintf( fout, "# identity count: %i\n", iden_no );
1167 fprintf( fout, "# identity: %g\n", identity );
1168 fprintf( fout, "# distance: %g\n", dist );
1169 if( options.is454 ) fprintf( fout, "# max indel: %i\n", max_indels );
1170 #if 0
1171 fprintf( fout, "%i %s\n", seq1->index, AA );
1172 fprintf( fout, "%i %s\n", seq2->index, BB );
1173 #else
1174 bool printaa = true;
1175 IB = IA = 0;
1176 fprintf( fout, "\n\nX " );
1177 while( IA < NN ){
1178 if( printaa ){
1179 fprintf( fout, "%c", BB[IB] );
1180 IB += 1;
1181 if( IB % 75 ==0 or IB == NN ) printaa = false, fprintf( fout, "\nY " );
1182 }else{
1183 fprintf( fout, "%c", AA[IA] );
1184 IA += 1;
1185 if( IA % 75 ==0 ) printaa = true, fprintf( fout, "\n\nX " );
1186 }
1187 }
1188 #endif
1189 fclose( fout );
1190 #endif
1191
1192 return OK_FUNC;
1193 } // END int local_band_align
1194
1195
1196
setaa_to_na()1197 void setaa_to_na()
1198 {
1199 int i;
1200 for (i=0; i<26; i++) aa2idx[i] = na2idx[i];
1201 } // END void setaa_to_na
1202
1203
1204 /////////////////
ScoreMatrix()1205 ScoreMatrix::ScoreMatrix()
1206 {
1207 init();
1208 }
1209
init()1210 void ScoreMatrix::init()
1211 {
1212 set_gap( -11, -1 );
1213 set_matrix( BLOSUM62 );
1214 }
1215
set_gap(int gap1,int ext_gap1)1216 void ScoreMatrix::set_gap(int gap1, int ext_gap1)
1217 {
1218 int i;
1219 gap = MAX_SEQ * gap1;
1220 ext_gap = MAX_SEQ * ext_gap1;
1221 }
1222
set_matrix(int * mat1)1223 void ScoreMatrix::set_matrix(int *mat1)
1224 {
1225 int i, j, k;
1226 k = 0;
1227 for ( i=0; i<MAX_AA; i++)
1228 for ( j=0; j<=i; j++)
1229 matrix[j][i] = matrix[i][j] = MAX_SEQ * mat1[ k++ ];
1230 }
1231
set_to_na()1232 void ScoreMatrix::set_to_na()
1233 {
1234 set_gap( -6, -1 );
1235 set_matrix( BLOSUM62_na );
1236 }
1237 // Only for est
set_match(int score)1238 void ScoreMatrix::set_match( int score )
1239 {
1240 int i;
1241 for ( i=0; i<5; i++) matrix[i][i] = MAX_SEQ * score;
1242 //matrix[3][4] = matrix[4][3] = MAX_SEQ * score;
1243 }
1244 // Only for est
set_mismatch(int score)1245 void ScoreMatrix::set_mismatch( int score )
1246 {
1247 int i, j;
1248 for ( i=0; i<MAX_AA; i++)
1249 for ( j=0; j<i; j++)
1250 matrix[j][i] = matrix[i][j] = MAX_SEQ * score;
1251 matrix[3][4] = matrix[4][3] = MAX_SEQ;
1252 }
1253
WordTable(int naa,int naan)1254 WordTable::WordTable( int naa, int naan )
1255 {
1256 NAA = 0;
1257 NAAN = 0;
1258 is_aa = 1;
1259 size = 0;
1260 frag_count = 0;
1261 Init( naa, naan );
1262 }
1263
SetDNA()1264 void WordTable::SetDNA()
1265 {
1266 is_aa = 0;
1267 }
1268
Init(int naa,int naan)1269 void WordTable::Init(int naa, int naan)
1270 {
1271 NAA = naa;
1272 NAAN = naan;
1273 indexCounts.resize( NAAN );
1274 }
1275
Clear()1276 void WordTable::Clear()
1277 {
1278 int i;
1279 #if 0
1280 int n1 = 0, n2 = 0, n3 = 0, ns = 0;
1281 for(i=0; i<NAAN; i++){
1282 NVector<IndexCount> & ics = indexCounts[i];
1283 for(int j=0; j<ics.size; j++){
1284 IndexCount ic = ics[j];
1285 n1 += ic.count == 1;
1286 n2 += ic.count == 2;
1287 n3 += ic.count == 3;
1288 ns += ic.count >= 4;
1289 }
1290 }
1291 printf( "%9i %9i %9i %9i\n", n1, n2, n3, ns );
1292 #endif
1293 size = 0;
1294 frag_count = 0;
1295 sequences.clear();
1296 for (i=0; i<NAAN; i++) indexCounts[i].size = 0;//Clear();
1297 }
1298
AddWordCounts(NVector<IndexCount> & counts,Sequence * seq,bool skipN)1299 int WordTable::AddWordCounts( NVector<IndexCount> & counts, Sequence *seq, bool skipN)
1300 {
1301 int aan_no = counts.Size();
1302 int i, j, k, idx = sequences.size();
1303 for (i=0; i<aan_no; i++) {
1304 IndexCount ic = counts[i];
1305 if ( (k=ic.count) ) {
1306 j = ic.index;
1307 if ( skipN && j<0) continue; // for those has 'N'
1308 NVector<IndexCount> & row = indexCounts[j];
1309 ic.index = idx;
1310 row.Append( ic );
1311 size += 1;
1312 }
1313 }
1314 sequences.Append( seq );
1315 return OK_FUNC;
1316 }
AddWordCountsFrag(NVector<IndexCount> & counts,int frag,int frag_size,int repfrag)1317 int WordTable::AddWordCountsFrag( NVector<IndexCount> & counts, int frag, int frag_size, int repfrag )
1318 {
1319 return 0;
1320 }
AddWordCounts(int aan_no,Vector<int> & word_encodes,Vector<INTs> & word_encodes_no,int idx,bool skipN)1321 int WordTable::AddWordCounts(int aan_no, Vector<int> & word_encodes, Vector<INTs> & word_encodes_no, int idx, bool skipN)
1322 {
1323 int i, j, k;
1324 //printf( "seq %6i: ", idx );
1325 for (i=0; i<aan_no; i++) {
1326 if ( (k=word_encodes_no[i]) ) {
1327 j = word_encodes[i];
1328 if( skipN && j<0) continue; // for those has 'N'
1329 NVector<IndexCount> & row = indexCounts[j];
1330 row.Append( IndexCount( idx, k ) );
1331 size += 1;
1332 //if( k >1 ) printf( " %3i", k );
1333 }
1334 }
1335 //printf( "\n" );
1336 return OK_FUNC;
1337 }
1338
AddWordCountsFrag(int aan_no,Vector<int> & word_encodes,Vector<INTs> & word_encodes_no,int frag,int frag_size)1339 int WordTable::AddWordCountsFrag( int aan_no, Vector<int> & word_encodes,
1340 Vector<INTs> & word_encodes_no, int frag, int frag_size )
1341 {
1342 int i, j, k, i1, k1, fra;
1343
1344 for (i=0; i<frag; i++) {
1345 k = (i+1)*frag_size < aan_no ? (i+1)*frag_size-1: aan_no-1;
1346 //quick_sort(&word_encodes[0], i*frag_size, k);
1347 std::sort( word_encodes.begin() + i*frag_size, word_encodes.begin() + k + 1 );
1348 }
1349 for(j=aan_no-1; j; j--) {
1350 if (word_encodes[j] == word_encodes[j-1]) {
1351 word_encodes_no[j-1] += word_encodes_no[j];
1352 word_encodes_no[j]=0;
1353 }
1354 }
1355 // END check_word_encodes
1356
1357 for (i=0; i<aan_no; i+=frag_size) {
1358 k = frag_size < (aan_no-i) ? frag_size : (aan_no -i);
1359 fra = i / frag_size;
1360 //AddWordCounts(k, word_encodes+i, word_encodes_no+i, NR90f_no+fra);
1361 for (i1=i; i1<i+k; i1++) {
1362 if ( (k1=word_encodes_no[i1]) ) {
1363 j = word_encodes[i1];
1364 NVector<IndexCount> & row = indexCounts[j];
1365 row.Append( IndexCount( frag_count + fra, k1 ) );
1366 size += 1;
1367 }
1368 }
1369 }
1370 frag_count += frag;
1371
1372 return 0;
1373 }
1374
1375
PrintAll()1376 void WordTable::PrintAll()
1377 {
1378 int i, j, k;
1379 int cols = 0;
1380 long long total_words = 0;
1381 k = 0;
1382 for (i=0; i<NAAN; i++) {
1383 int size = indexCounts[i].Size();
1384 if ( size == 0 ) continue;
1385 cols++;
1386 cout << k << "\t" << i << "\tsize:" << size << "\t";
1387 for (j=0; j<size; j++) {
1388 cout << indexCounts[i][j].index << "," << indexCounts[i][j].count << " ";
1389 total_words += indexCounts[i][j].count;
1390 }
1391 cout << endl;
1392 k++;
1393 }
1394
1395 cout << "total cols: " << cols << " total words: " << total_words << endl;
1396 }
1397
1398
1399 /* Quick Sort.
1400 * Adam Drozdek: Data Structures and Algorithms in C++, 2nd Edition.
1401 */
PartialQuickSort(IndexCount * data,int first,int last,int partial)1402 void PartialQuickSort( IndexCount *data, int first, int last, int partial )
1403 {
1404 int lower=first+1, upper=last;
1405 IndexCount pivot;
1406 IndexCount val;
1407 if( first >= last ) return;
1408 val = data[first];
1409 data[first] = data[ (first+last)/2 ];
1410 data[ (first+last)/2 ] = val;
1411 pivot = data[ first ];
1412
1413 while( lower <= upper ){
1414 while( lower <= last && data[lower].count < pivot.count ) lower ++;
1415 while( pivot.count < data[upper].count ) upper --;
1416 if( lower < upper ){
1417 val = data[lower];
1418 data[lower] = data[upper];
1419 data[upper] = val;
1420 upper --;
1421 }
1422 lower ++;
1423 }
1424 val = data[first];
1425 data[first] = data[upper];
1426 data[upper] = val;
1427 if( first < upper-1 ) PartialQuickSort( data, first, upper-1, partial );
1428 if( upper >= partial ) return;
1429 if( upper+1 < last ) PartialQuickSort( data, upper+1, last, partial );
1430 }
CountWords(int aan_no,Vector<int> & word_encodes,Vector<INTs> & word_encodes_no,NVector<IndexCount> & lookCounts,NVector<uint32_t> & indexMapping,bool est,int min)1431 int WordTable::CountWords(int aan_no, Vector<int> & word_encodes, Vector<INTs> & word_encodes_no,
1432 NVector<IndexCount> &lookCounts, NVector<uint32_t> & indexMapping,
1433 bool est, int min)
1434 {
1435 int S = frag_count ? frag_count : sequences.size();
1436 int j, k, j0, j1, k1, m;
1437 int ix1, ix2, ix3, ix4;
1438 IndexCount tmp;
1439
1440 IndexCount *ic = lookCounts.items;
1441 for(j=0; j<lookCounts.size; j++, ic++) indexMapping[ ic->index ] = 0;
1442 lookCounts.size = 0;
1443
1444 int *we = & word_encodes[0];
1445 j0 = 0;
1446 if( est ) while( *we <0 ) j0++, we++; // if met short word has 'N'
1447 INTs *wen = & word_encodes_no[j0];
1448 //printf( "\nquery : " );
1449 for (; j0<aan_no; j0++, we++, wen++) {
1450 j = *we;
1451 j1 = *wen;
1452 //if( j1 >1 ) printf( " %3i", j1 );
1453 if( j1==0 ) continue;
1454 NVector<IndexCount> & one = indexCounts[j];
1455 k1 = one.Size();
1456 IndexCount *ic = one.items;
1457
1458 int rest = aan_no - j0 + 1;
1459 for (k=0; k<k1; k++, ic++){
1460 int c = ic->count < j1 ? ic->count : j1;
1461 uint32_t *idm = indexMapping.items + ic->index;
1462 if( *idm ==0 ){
1463 if( rest < min ) continue;
1464 IndexCount *ic2 = lookCounts.items + lookCounts.size;
1465 lookCounts.size += 1;
1466 *idm = lookCounts.size;
1467 ic2->index = ic->index;
1468 ic2->count = c;
1469 }else{
1470 lookCounts[ *idm - 1 ].count += c;
1471 }
1472 }
1473 }
1474 //printf( "%6i %6i\n", S, lookCounts.size );
1475 lookCounts[ lookCounts.size ].count = 0;
1476 //printf( "\n\n" );
1477 return OK_FUNC;
1478 }
1479
Sequence()1480 Sequence::Sequence()
1481 {
1482 memset( this, 0, sizeof( Sequence ) );
1483 distance = 2.0;
1484 }
Sequence(const Sequence & other)1485 Sequence::Sequence( const Sequence & other )
1486 {
1487 int i;
1488 //printf( "new: %p %p\n", this, & other );
1489 memcpy( this, & other, sizeof( Sequence ) );
1490 distance = 2.0;
1491 if( other.data ){
1492 size = bufsize = other.size;
1493 size_R2 = 0;
1494 data = new char[size+1];
1495 //printf( "data: %p %p\n", data, other.data );
1496 data[size] = 0;
1497 memcpy( data, other.data, size );
1498 //for (i=0; i<size; i++) data[i] = other.data[i];
1499 }
1500 if( other.identifier ){
1501 int len = strlen( other.identifier );
1502 identifier = new char[len+1];
1503 memcpy( identifier, other.identifier, len );
1504 identifier[len] = 0;
1505 }
1506 }
1507
1508 // back to back merge for PE
1509 // R1 -> XXXXXXABC ------------------- NMLYYYYYY <--R2
1510 // >R1 >R2
1511 // XXXXXXABC YYYYYYLMN =====> Merge into
1512 // >R12
1513 // NMLYYYYYYXXXXXXABC
Sequence(const Sequence & other,const Sequence & other2,int mode)1514 Sequence::Sequence( const Sequence & other, const Sequence & other2, int mode )
1515 {
1516 int i;
1517 if (mode != 1) bomb_error("unknown mode");
1518
1519 //printf( "new: %p %p\n", this, & other );
1520 memcpy( this, & other, sizeof( Sequence ) );
1521 distance = 2.0;
1522
1523 if( other.data && other2.data ){
1524 size = bufsize = (other.size + other2.size);
1525 size_R2 = other2.size;
1526 data = new char[size+1];
1527 //printf( "data: %p %p\n", data, other.data );
1528 data[size] = 0;
1529 data[size_R2] = 0;
1530 memcpy( data, other2.data, size_R2); // copy R2 first
1531 strrev( data ); // reverse R2 on data
1532 memcpy( data+size_R2, other.data, size-size_R2 ); // copy R1 to end of R2
1533 //for (i=0; i<size; i++) data[i] = other.data[i];
1534 des_begin2 = other2.des_begin;
1535 tot_length2= other2.tot_length;
1536 }
1537 else if ( other.data || other2.data ) {
1538 bomb_error("Not both PE sequences have data");
1539 }
1540
1541 if( other.identifier ){ // only use R1
1542 int len = strlen( other.identifier );
1543 identifier = new char[len+1];
1544 memcpy( identifier, other.identifier, len );
1545 identifier[len] = 0;
1546 }
1547 }
1548
1549
~Sequence()1550 Sequence::~Sequence()
1551 {
1552 //printf( "delete: %p\n", this );
1553 if( data ) delete[] data;
1554 if( identifier ) delete[] identifier;
1555 }
1556
Clear()1557 void Sequence::Clear()
1558 {
1559 if( data ) delete[] data;
1560 /* do not set size to zero here, it is need for writing output */
1561 bufsize = 0;
1562 data = NULL;
1563 }
1564
operator =(const char * s)1565 void Sequence::operator=( const char *s )
1566 {
1567 size = 0; // avoid copying;
1568 Resize( strlen( s ) );
1569 strcpy( data, s );
1570 }
operator +=(const char * s)1571 void Sequence::operator+=( const char *s )
1572 {
1573 int i, m = size, n = strlen( s );
1574 Reserve( m + n );
1575 memcpy( data+m, s, n );
1576 }
Resize(int n)1577 void Sequence::Resize( int n )
1578 {
1579 int i, m = size < n ? size : n;
1580 size = n;
1581 if( size != bufsize ){
1582 char *old = data;
1583 bufsize = size;
1584 data = new char[ bufsize + 1 ];
1585 if ( data == NULL ) bomb_error( "Memory" );
1586 if ( old ){
1587 memcpy( data, old, m );
1588 delete []old;
1589 }
1590 if( size ) data[size] = 0;
1591 }
1592 }
Reserve(int n)1593 void Sequence::Reserve( int n )
1594 {
1595 int i, m = size < n ? size : n;
1596 size = n;
1597 if( size > bufsize ){
1598 char *old = data;
1599 bufsize = size + size/5 + 1;
1600 data = new char[ bufsize + 1 ];
1601 if ( data == NULL ) bomb_error( "Memory" );
1602 if ( old ){
1603 memcpy( data, old, m );
1604 delete []old;
1605 }
1606 }
1607 if( size ) data[size] = 0;
1608 }
trim(int trim_len)1609 void Sequence::trim(int trim_len) {
1610 if (trim_len >= size) return;
1611 size = trim_len;
1612 if (size) data[size]=0;
1613 }
ConvertBases()1614 void Sequence::ConvertBases()
1615 {
1616 int i;
1617 for(i=0; i<size; i++) data[i] = aa2idx[data[i] - 'A'];
1618 }
1619
Swap(Sequence & other)1620 void Sequence::Swap( Sequence & other )
1621 {
1622 Sequence tmp;
1623 memcpy( & tmp, this, sizeof( Sequence ) );
1624 memcpy( this, & other, sizeof( Sequence ) );
1625 memcpy( & other, & tmp, sizeof( Sequence ) );
1626 memset( & tmp, 0, sizeof( Sequence ) );
1627 }
Format()1628 int Sequence::Format()
1629 {
1630 int i, j=0, m = 0;
1631 while( size && isspace( data[size-1] ) ) size --;
1632 if( size && data[size-1] == '*' ) size --;
1633 if( size ) data[size] = 0;
1634 for (i=0; i<size; i++){
1635 char ch = data[i];
1636 m += ! (isalpha( ch ) | isspace( ch ));
1637 }
1638 if( m ) return m;
1639 for (i=0; i<size; i++){
1640 char ch = data[i];
1641 if ( isalpha( ch ) ) data[j++] = toupper( ch );
1642 }
1643 data[j] = 0;
1644 size = j;
1645 return 0;
1646 }
1647
SwapIn()1648 void Sequence::SwapIn()
1649 {
1650 if( data ) return;
1651 if( swap == NULL ) bomb_error( "Can not swap in sequence" );
1652 Resize( size );
1653 fseek( swap, offset, SEEK_SET );
1654 if( fread( data, 1, size, swap ) ==0 ) bomb_error( "Can not swap in sequence" );
1655 data[size] = 0;
1656 }
SwapOut()1657 void Sequence::SwapOut()
1658 {
1659 if( swap && data ){
1660 delete[] data;
1661 bufsize = 0;
1662 data = NULL;
1663 }
1664 }
PrintInfo(int id,FILE * fout,const Options & options,char * buf)1665 void Sequence::PrintInfo( int id, FILE *fout, const Options & options, char *buf )
1666 {
1667 const char *tag = options.isEST ? "nt" : "aa";
1668 bool print = options.print != 0;
1669 bool strand = options.isEST;
1670 fprintf( fout, "%i\t%i%s, >%s...", id, size, tag, identifier+1 );
1671 if( identity ){
1672 int *c = coverage;
1673 fprintf( fout, " at " );
1674 if (print) fprintf( fout, "%i:%i:%i:%i/", c[0], c[1], c[2], c[3] );
1675 if (strand) fprintf( fout, "%c/", (state & IS_MINUS_STRAND) ? '-' : '+' );
1676 fprintf( fout, "%.2f%%", identity*100 );
1677 if( options.useDistance ) fprintf( fout, "/%.2f%%", distance*100 );
1678 fprintf( fout, "\n" );
1679 }else{
1680 fprintf( fout, " *\n" );
1681 }
1682 }
1683
1684 // by liwz gzip version 2019-02
1685 // by liwz
1686 // disable swap option
1687 // change des_begin, des_length, des_length2, dat_length => des_begin, tot_length
1688 // where des_begin is the FILE pointer of sequence record start
1689 // tot_length is the total bytes of sequence record
Readgz(const char * file,const Options & options)1690 void SequenceDB::Readgz( const char *file, const Options & options )
1691 {
1692 #ifdef WITH_ZLIB
1693 Sequence one;
1694 Sequence des;
1695 gzFile fin = gzopen(file, "r");
1696 char *buffer = NULL;
1697 char *res = NULL;
1698 int option_l = options.min_length;
1699 if( fin == NULL ) bomb_error( "Failed to open the database file" );
1700 Clear();
1701 buffer = new char[ MAX_LINE_SIZE+1 ];
1702
1703 while (not gzeof( fin ) || one.size) { /* do not break when the last sequence is not handled */
1704 buffer[0] = '>';
1705 if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL && one.size == 0) break;
1706 if( buffer[0] == '+' ){
1707 int len = strlen( buffer );
1708 int len2 = len;
1709 while( len2 && buffer[len2-1] != '\n' ){
1710 if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) break;
1711 len2 = strlen( buffer );
1712 len += len2;
1713 }
1714 one.tot_length += len;
1715
1716 // read next line quality score
1717 if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) bomb_error("can not read quality score after");
1718 len = strlen( buffer );
1719 len2 = len;
1720 while( len2 && buffer[len2-1] != '\n' ){
1721 if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) break;
1722 len2 = strlen( buffer );
1723 len += len2;
1724 }
1725 one.tot_length += len;
1726 }else if (buffer[0] == '>' || buffer[0] == '@' || (res==NULL && one.size)) {
1727 if ( one.size ) { // write previous record
1728 if( one.identifier == NULL || one.Format() ){
1729 printf( "Warning: from file \"%s\",\n", file );
1730 printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
1731 if( one.identifier ) printf( "%s\n", one.identifier );
1732 printf( "%s\n", one.data );
1733 one.size = 0;
1734 }
1735 one.index = sequences.size();
1736 if( one.size > option_l ) {
1737 if (options.trim_len > 0) one.trim(options.trim_len);
1738 sequences.Append( new Sequence( one ) );
1739 }
1740 }
1741 one.size = 0;
1742 one.tot_length = 0;
1743
1744 int len = strlen( buffer );
1745 int len2 = len;
1746 des.size = 0;
1747 des += buffer;
1748 while( len2 && buffer[len2-1] != '\n' ){
1749 if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) break;
1750 des += buffer;
1751 len2 = strlen( buffer );
1752 len += len2;
1753 }
1754 size_t offset = gztell( fin );
1755 one.des_begin = offset - len;
1756 one.tot_length += len; // count first line
1757
1758 int i = 0;
1759 if( des.data[i] == '>' || des.data[i] == '@' || des.data[i] == '+' ) i += 1;
1760 if( des.data[i] == ' ' or des.data[i] == '\t' ) i += 1;
1761 if( options.des_len and options.des_len < des.size ) des.size = options.des_len;
1762 while( i < des.size and ! isspace( des.data[i] ) ) i += 1;
1763 des.data[i] = 0;
1764 one.identifier = des.data;
1765 } else {
1766 one.tot_length += strlen(buffer); one += buffer;
1767 }
1768 }
1769 one.identifier = NULL;
1770 delete[] buffer;
1771 gzclose( fin );
1772 #else
1773 bomb_error("this program was not compiled with zlib");
1774 #endif
1775 }
1776
1777
1778
1779 // by liwz
1780 // disable swap option
1781 // change des_begin, des_length, des_length2, dat_length => des_begin, tot_length
1782 // where des_begin is the FILE pointer of sequence record start
1783 // tot_length is the total bytes of sequence record
Read(const char * file,const Options & options)1784 void SequenceDB::Read( const char *file, const Options & options )
1785 {
1786 int f_len = strlen(file);
1787 if (strcmp(file + f_len - 3, ".gz") == 0 ) {
1788 Readgz(file, options);
1789 return;
1790 }
1791
1792 Sequence one;
1793 Sequence des;
1794 FILE *fin = fopen( file, "rb" );
1795 char *buffer = NULL;
1796 char *res = NULL;
1797 int option_l = options.min_length;
1798 if( fin == NULL ) bomb_error( "Failed to open the database file" );
1799 Clear();
1800 buffer = new char[ MAX_LINE_SIZE+1 ];
1801
1802 while (not feof( fin ) || one.size) { /* do not break when the last sequence is not handled */
1803 buffer[0] = '>';
1804 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL && one.size == 0) break;
1805 if( buffer[0] == '+' ){
1806 int len = strlen( buffer );
1807 int len2 = len;
1808 while( len2 && buffer[len2-1] != '\n' ){
1809 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
1810 len2 = strlen( buffer );
1811 len += len2;
1812 }
1813 one.tot_length += len;
1814
1815 // read next line quality score
1816 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) bomb_error("can not read quality score after");
1817 len = strlen( buffer );
1818 len2 = len;
1819 while( len2 && buffer[len2-1] != '\n' ){
1820 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
1821 len2 = strlen( buffer );
1822 len += len2;
1823 }
1824 one.tot_length += len;
1825 }else if (buffer[0] == '>' || buffer[0] == '@' || (res==NULL && one.size)) {
1826 if ( one.size ) { // write previous record
1827 if( one.identifier == NULL || one.Format() ){
1828 printf( "Warning: from file \"%s\",\n", file );
1829 printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
1830 if( one.identifier ) printf( "%s\n", one.identifier );
1831 printf( "%s\n", one.data );
1832 one.size = 0;
1833 }
1834 one.index = sequences.size();
1835 if( one.size > option_l ) {
1836 if (options.trim_len > 0) one.trim(options.trim_len);
1837 sequences.Append( new Sequence( one ) );
1838 }
1839 }
1840 one.size = 0;
1841 one.tot_length = 0;
1842
1843 int len = strlen( buffer );
1844 int len2 = len;
1845 des.size = 0;
1846 des += buffer;
1847 while( len2 && buffer[len2-1] != '\n' ){
1848 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
1849 des += buffer;
1850 len2 = strlen( buffer );
1851 len += len2;
1852 }
1853 size_t offset = ftell( fin );
1854 one.des_begin = offset - len;
1855 one.tot_length += len; // count first line
1856
1857 int i = 0;
1858 if( des.data[i] == '>' || des.data[i] == '@' || des.data[i] == '+' ) i += 1;
1859 if( des.data[i] == ' ' or des.data[i] == '\t' ) i += 1;
1860 if( options.des_len and options.des_len < des.size ) des.size = options.des_len;
1861 while( i < des.size and ! isspace( des.data[i] ) ) i += 1;
1862 des.data[i] = 0;
1863 one.identifier = des.data;
1864 } else {
1865 one.tot_length += strlen(buffer); one += buffer;
1866 }
1867 }
1868 #if 0
1869 int i, n = 0;
1870 for(i=0; i<sequences.size(); i++) n += sequences[i].bufsize + 4;
1871 cout<<n<<"\t"<<sequences.capacity() * sizeof(Sequence)<<endl;
1872 int i;
1873 scanf( "%i", & i );
1874 #endif
1875 one.identifier = NULL;
1876 delete[] buffer;
1877 fclose( fin );
1878 }
1879
1880
1881 // by liwz gzip version 2019-02
1882 // PE reads liwz, disable swap option
Readgz(const char * file,const char * file2,const Options & options)1883 void SequenceDB::Readgz( const char *file, const char *file2, const Options & options )
1884 {
1885 #ifdef WITH_ZLIB
1886 Sequence one, two;
1887 Sequence des;
1888 gzFile fin = gzopen(file, "r");
1889 gzFile fin2= gzopen(file2,"r");
1890 char *buffer = NULL;
1891 char *buffer2= NULL;
1892 char *res = NULL;
1893 char *res2= NULL;
1894 int option_l = options.min_length;
1895 if( fin == NULL ) bomb_error( "Failed to open the database file" );
1896 if( fin2== NULL ) bomb_error( "Failed to open the database file" );
1897 Clear();
1898 buffer = new char[ MAX_LINE_SIZE+1 ];
1899 buffer2= new char[ MAX_LINE_SIZE+1 ];
1900
1901 while (((not gzeof( fin )) && (not gzeof( fin2)) ) || (one.size && two.size)) { /* do not break when the last sequence is not handled */
1902 buffer[0] = '>'; res =gzgets(fin, buffer, MAX_LINE_SIZE);
1903 buffer2[0]= '>'; res2=gzgets(fin2, buffer2, MAX_LINE_SIZE);
1904
1905 if ( (res == NULL) && (res2 != NULL)) bomb_error( "Paired input files have different number sequences" );
1906 if ( (res != NULL) && (res2 == NULL)) bomb_error( "Paired input files have different number sequences" );
1907 if ( (one.size == 0 ) && (two.size > 0)) bomb_error( "Paired input files have different number sequences" );
1908 if ( (one.size > 0 ) && (two.size == 0)) bomb_error( "Paired input files have different number sequences" );
1909 if ( (res == NULL) && (one.size == 0)) break;
1910
1911 if( buffer[0] == '+' ){ // fastq 3rd line
1912 // file 1
1913 int len = strlen( buffer );
1914 int len2 = len;
1915 while( len2 && buffer[len2-1] != '\n' ){ // read until the end of the line
1916 if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) break;
1917 len2 = strlen( buffer );
1918 len += len2;
1919 }
1920 one.tot_length += len;
1921
1922 // read next line quality score
1923 if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) bomb_error("can not read quality score after");
1924 len = strlen( buffer );
1925 len2 = len;
1926 while( len2 && buffer[len2-1] != '\n' ){
1927 if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) break;
1928 len2 = strlen( buffer );
1929 len += len2;
1930 }
1931 one.tot_length += len;
1932
1933 // file 2
1934 len = strlen( buffer2 );
1935 len2 = len;
1936 while( len2 && buffer2[len2-1] != '\n' ){ // read until the end of the line
1937 if ( (res2=gzgets(fin2, buffer2, MAX_LINE_SIZE)) == NULL ) break;
1938 len2 = strlen( buffer2 );
1939 len += len2;
1940 }
1941 two.tot_length += len;
1942
1943 // read next line quality score
1944 if ( (res2=gzgets(fin2, buffer2, MAX_LINE_SIZE)) == NULL ) bomb_error("can not read quality score after");
1945 len = strlen( buffer2 );
1946 len2 = len;
1947 while( len2 && buffer2[len2-1] != '\n' ){
1948 if ( (res2=gzgets(fin2, buffer2, MAX_LINE_SIZE)) == NULL ) break;
1949 len2 = strlen( buffer2 );
1950 len += len2;
1951 }
1952 two.tot_length += len;
1953
1954 }else if (buffer[0] == '>' || buffer[0] == '@' || (res==NULL && one.size)) {
1955 if ( one.size && two.size ) { // write previous record
1956 if( one.identifier == NULL || one.Format() ){
1957 printf( "Warning: from file \"%s\",\n", file );
1958 printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
1959 if( one.identifier ) printf( "%s\n", one.identifier );
1960 printf( "%s\n", one.data );
1961 one.size=0; two.size=0;
1962 }
1963 if( two.identifier == NULL || two.Format() ){
1964 printf( "Warning: from file \"%s\",\n", file2 );
1965 printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
1966 if( two.identifier ) printf( "%s\n", two.identifier );
1967 printf( "%s\n", two.data );
1968 one.size=0; two.size = 0;
1969 }
1970 one.index = sequences.size();
1971 if( (one.size + two.size)> option_l ) {
1972 if (options.trim_len > 0) one.trim(options.trim_len);
1973 if (options.trim_len_R2 > 0) two.trim(options.trim_len_R2);
1974 sequences.Append( new Sequence( one, two, 1 ) );
1975 }
1976 }
1977 // R1
1978 one.size = 0;
1979 one.tot_length = 0;
1980
1981 int len = strlen( buffer );
1982 int len2 = len;
1983 des.size = 0;
1984 des += buffer;
1985 while( len2 && buffer[len2-1] != '\n' ){
1986 if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) break;
1987 des += buffer;
1988 len2 = strlen( buffer );
1989 len += len2;
1990 }
1991 size_t offset = gztell( fin );
1992 one.des_begin = offset - len; // offset of ">" or "@"
1993 one.tot_length += len; // count first line
1994
1995 int i = 0;
1996 if( des.data[i] == '>' || des.data[i] == '@' || des.data[i] == '+' ) i += 1;
1997 if( des.data[i] == ' ' or des.data[i] == '\t' ) i += 1;
1998 if( options.des_len and options.des_len < des.size ) des.size = options.des_len;
1999 while( i < des.size and ! isspace( des.data[i] ) ) i += 1;
2000 des.data[i] = 0; // find first non-space letter
2001 one.identifier = des.data;
2002
2003 // R2
2004 two.size = 0;
2005 two.tot_length = 0;
2006
2007 len = strlen( buffer2 );
2008 len2 = len;
2009 while( len2 && buffer2[len2-1] != '\n' ){
2010 if ( (res=gzgets(fin2, buffer2, MAX_LINE_SIZE)) == NULL ) break;
2011 len2 = strlen( buffer2 );
2012 len += len2;
2013 }
2014 offset = gztell( fin2 );
2015 two.des_begin = offset - len;
2016 two.tot_length += len; // count first line
2017 two.identifier = des.data;
2018 } else {
2019 one.tot_length += strlen(buffer); one += buffer;
2020 two.tot_length+= strlen(buffer2); two+= buffer2;
2021 }
2022 }
2023 one.identifier = NULL;
2024 two.identifier = NULL;
2025 delete[] buffer;
2026 gzclose( fin );
2027 delete[] buffer2;
2028 gzclose( fin2 );
2029 #else
2030 bomb_error("this program was not compiled with zlib");
2031 #endif
2032
2033 }
2034
2035 // PE reads liwz, disable swap option
Read(const char * file,const char * file2,const Options & options)2036 void SequenceDB::Read( const char *file, const char *file2, const Options & options )
2037 {
2038 int f_len = strlen(file);
2039 int f_len2= strlen(file2);
2040 if (strcmp(file + f_len - 3, ".gz") == 0 ) {
2041 if ( strcmp(file2 + f_len2 - 3, ".gz") ) bomb_error( "Both input files need to be in .gz format" );
2042 Readgz(file, file2, options);
2043 return;
2044 }
2045
2046 Sequence one, two;
2047 Sequence des;
2048 FILE *fin = fopen( file, "rb" );
2049 FILE *fin2= fopen( file2,"rb" );
2050 char *buffer = NULL;
2051 char *buffer2= NULL;
2052 char *res = NULL;
2053 char *res2= NULL;
2054 int option_l = options.min_length;
2055 if( fin == NULL ) bomb_error( "Failed to open the database file" );
2056 if( fin2== NULL ) bomb_error( "Failed to open the database file" );
2057 Clear();
2058 buffer = new char[ MAX_LINE_SIZE+1 ];
2059 buffer2= new char[ MAX_LINE_SIZE+1 ];
2060
2061 while (((not feof( fin )) && (not feof( fin2)) ) || (one.size && two.size)) { /* do not break when the last sequence is not handled */
2062 buffer[0] = '>'; res =fgets( buffer, MAX_LINE_SIZE, fin );
2063 buffer2[0]= '>'; res2=fgets( buffer2, MAX_LINE_SIZE, fin2 );
2064
2065 if ( (res == NULL) && (res2 != NULL)) bomb_error( "Paired input files have different number sequences" );
2066 if ( (res != NULL) && (res2 == NULL)) bomb_error( "Paired input files have different number sequences" );
2067 if ( (one.size == 0 ) && (two.size > 0)) bomb_error( "Paired input files have different number sequences" );
2068 if ( (one.size > 0 ) && (two.size == 0)) bomb_error( "Paired input files have different number sequences" );
2069 if ( (res == NULL) && (one.size == 0)) break;
2070
2071 if( buffer[0] == '+' ){ // fastq 3rd line
2072 // file 1
2073 int len = strlen( buffer );
2074 int len2 = len;
2075 while( len2 && buffer[len2-1] != '\n' ){ // read until the end of the line
2076 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
2077 len2 = strlen( buffer );
2078 len += len2;
2079 }
2080 one.tot_length += len;
2081
2082 // read next line quality score
2083 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) bomb_error("can not read quality score after");
2084 len = strlen( buffer );
2085 len2 = len;
2086 while( len2 && buffer[len2-1] != '\n' ){
2087 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
2088 len2 = strlen( buffer );
2089 len += len2;
2090 }
2091 one.tot_length += len;
2092
2093 // file 2
2094 len = strlen( buffer2 );
2095 len2 = len;
2096 while( len2 && buffer2[len2-1] != '\n' ){ // read until the end of the line
2097 if ( (res2=fgets( buffer2, MAX_LINE_SIZE, fin2 )) == NULL ) break;
2098 len2 = strlen( buffer2 );
2099 len += len2;
2100 }
2101 two.tot_length += len;
2102
2103 // read next line quality score
2104 if ( (res2=fgets( buffer2, MAX_LINE_SIZE, fin2 )) == NULL ) bomb_error("can not read quality score after");
2105 len = strlen( buffer2 );
2106 len2 = len;
2107 while( len2 && buffer2[len2-1] != '\n' ){
2108 if ( (res2=fgets( buffer2, MAX_LINE_SIZE, fin2 )) == NULL ) break;
2109 len2 = strlen( buffer2 );
2110 len += len2;
2111 }
2112 two.tot_length += len;
2113
2114 }else if (buffer[0] == '>' || buffer[0] == '@' || (res==NULL && one.size)) {
2115 if ( one.size && two.size ) { // write previous record
2116 if( one.identifier == NULL || one.Format() ){
2117 printf( "Warning: from file \"%s\",\n", file );
2118 printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
2119 if( one.identifier ) printf( "%s\n", one.identifier );
2120 printf( "%s\n", one.data );
2121 one.size=0; two.size=0;
2122 }
2123 if( two.identifier == NULL || two.Format() ){
2124 printf( "Warning: from file \"%s\",\n", file2 );
2125 printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
2126 if( two.identifier ) printf( "%s\n", two.identifier );
2127 printf( "%s\n", two.data );
2128 one.size=0; two.size = 0;
2129 }
2130 one.index = sequences.size();
2131 if( (one.size + two.size)> option_l ) {
2132 if (options.trim_len > 0) one.trim(options.trim_len);
2133 if (options.trim_len_R2 > 0) two.trim(options.trim_len_R2);
2134 sequences.Append( new Sequence( one, two, 1 ) );
2135 }
2136 }
2137 // R1
2138 one.size = 0;
2139 one.tot_length = 0;
2140
2141 int len = strlen( buffer );
2142 int len2 = len;
2143 des.size = 0;
2144 des += buffer;
2145 while( len2 && buffer[len2-1] != '\n' ){
2146 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
2147 des += buffer;
2148 len2 = strlen( buffer );
2149 len += len2;
2150 }
2151 size_t offset = ftell( fin );
2152 one.des_begin = offset - len; // offset of ">" or "@"
2153 one.tot_length += len; // count first line
2154
2155 int i = 0;
2156 if( des.data[i] == '>' || des.data[i] == '@' || des.data[i] == '+' ) i += 1;
2157 if( des.data[i] == ' ' or des.data[i] == '\t' ) i += 1;
2158 if( options.des_len and options.des_len < des.size ) des.size = options.des_len;
2159 while( i < des.size and ! isspace( des.data[i] ) ) i += 1;
2160 des.data[i] = 0; // find first non-space letter
2161 one.identifier = des.data;
2162
2163 // R2
2164 two.size = 0;
2165 two.tot_length = 0;
2166
2167 len = strlen( buffer2 );
2168 len2 = len;
2169 while( len2 && buffer2[len2-1] != '\n' ){
2170 if ( (res=fgets( buffer2, MAX_LINE_SIZE, fin2 )) == NULL ) break;
2171 len2 = strlen( buffer2 );
2172 len += len2;
2173 }
2174 offset = ftell( fin2 );
2175 two.des_begin = offset - len;
2176 two.tot_length += len; // count first line
2177 two.identifier = des.data;
2178 } else {
2179 one.tot_length += strlen(buffer); one += buffer;
2180 two.tot_length+= strlen(buffer2); two+= buffer2;
2181 }
2182 }
2183 #if 0
2184 int i, n = 0;
2185 for(i=0; i<sequences.size(); i++) n += sequences[i].bufsize + 4;
2186 cout<<n<<"\t"<<sequences.capacity() * sizeof(Sequence)<<endl;
2187 int i;
2188 scanf( "%i", & i );
2189 #endif
2190 one.identifier = NULL;
2191 two.identifier = NULL;
2192 delete[] buffer;
2193 fclose( fin );
2194 delete[] buffer2;
2195 fclose( fin2 );
2196 }
2197
2198 #if 0
2199 void SequenceDB::Sort( int first, int last )
2200 {
2201 int lower=first+1, upper=last;
2202 Sequence *pivot;
2203 Sequence *val;
2204 if( first >= last ) return;
2205 val = sequences[first];
2206 sequences[first] = sequences[ (first+last)/2 ];
2207 sequences[ (first+last)/2 ] = val;
2208 pivot = sequences[ first ];
2209
2210 while( lower <= upper ){
2211 while( lower <= last && sequences[lower]->stats < pivot->stats ) lower ++;
2212 while( pivot->stats < sequences[upper]->stats ) upper --;
2213 if( lower < upper ){
2214 val = sequences[lower];
2215 sequences[lower] = sequences[upper];
2216 sequences[upper] = val;
2217 upper --;
2218 }
2219 lower ++;
2220 }
2221 val = sequences[first];
2222 sequences[first] = sequences[upper];
2223 sequences[upper] = val;
2224 if( first < upper-1 ) Sort( first, upper-1 );
2225 if( upper+1 < last ) Sort( upper+1, last );
2226 }
2227 #endif
SortDivide(Options & options,bool sort)2228 void SequenceDB::SortDivide( Options & options, bool sort )
2229 {
2230 int i, j, k, len;
2231 int N = sequences.size();
2232 total_letter=0;
2233 total_desc=0;
2234 max_len = 0;
2235 min_len = (size_t)-1;
2236 for (i=0; i<N; i++) {
2237 Sequence *seq = sequences[i];
2238 len = seq->size;
2239 total_letter += len;
2240 if (len > max_len) max_len = len;
2241 if (len < min_len) min_len = len;
2242 if (seq->swap == NULL) seq->ConvertBases();
2243 if( seq->identifier ) total_desc += strlen( seq->identifier );
2244 }
2245 options.max_entries = max_len * MAX_TABLE_SEQ;
2246 if (max_len >= 65536 and sizeof(INTs) <=2)
2247 bomb_warning("Some seqs longer than 65536, you may define LONG_SEQ");
2248
2249 if (max_len > MAX_SEQ )
2250 bomb_warning("Some seqs are too long, please rebuild the program with make parameter "
2251 "MAX_SEQ=new-maximum-length (e.g. make MAX_SEQ=10000000)");
2252
2253 cout << "longest and shortest : " << max_len << " and " << min_len << endl;
2254 cout << "Total letters: " << total_letter << endl;
2255 // END change all the NR_seq to iseq
2256
2257 len_n50 = (max_len + min_len) / 2; // will be properly set, if sort is true;
2258 if( sort ){
2259 // **************************** Form NR_idx[], Sort them from Long to short
2260 long long sum = 0;
2261 int M = max_len - min_len + 1;
2262 Vector<int> count( M, 0 ); // count for each size = max_len - i
2263 Vector<int> accum( M, 0 ); // count for all size > max_len - i
2264 Vector<int> offset( M, 0 ); // offset from accum[i] when filling sorting
2265 Vector<Sequence*> sorting( N ); // TODO: use a smaller class if this consumes to much memory!
2266
2267 for (i=0; i<N; i++) count[ max_len - sequences[i]->size ] ++;
2268 for (i=1; i<M; i++) accum[i] = accum[i-1] + count[i-1];
2269 for (i=0; i<M; i++){
2270 sum += (max_len - i) * count[i];
2271 if( sum >= (total_letter>>1) ){
2272 len_n50 = max_len - i;
2273 break;
2274 }
2275 }
2276 for (i=0; i<N; i++){
2277 int len = max_len - sequences[i]->size;
2278 int id = accum[len] + offset[len];
2279 //sequences[i].index = id;
2280 sorting[id] = sequences[i];
2281 offset[len] ++;
2282 }
2283 options.max_entries = 0;
2284 for (i=0; i<N; i++){
2285 sequences[i] = sorting[i];
2286 if( i < MAX_TABLE_SEQ ) options.max_entries += sequences[i]->size;
2287 }
2288 #if 0
2289 if( options.isEST ){
2290 int start = 0;
2291 for (i=0; i<M; i++){
2292 Sort( start, accum[i] );
2293 start = accum[i];
2294 }
2295 }
2296 #endif
2297 cout << "Sequences have been sorted" << endl;
2298 // END sort them from long to short
2299 }
2300 }// END sort_seqs_divide_segs
2301
DivideSave(const char * db,const char * newdb,int n,const Options & options)2302 void SequenceDB::DivideSave( const char *db, const char *newdb, int n, const Options & options )
2303 {
2304 if( n == 0 or sequences.size() ==0 ) return;
2305
2306 size_t max_seg = total_letter / n + sequences[0]->size;
2307 if( max_seg >= MAX_BIN_SWAP ) max_seg = (size_t) MAX_BIN_SWAP;
2308
2309 FILE *fin = fopen( db, "rb" );
2310 char *buf = new char[MAX_LINE_SIZE+1];
2311 char outfile[512];
2312 size_t seg_size = 0;
2313 int i, j, count, rest, seg = 0;
2314 sprintf( outfile, "%s-%i", newdb, 0 );
2315 FILE *fout = fopen( outfile, "w+" );
2316 n = sequences.size();
2317 for (i=0; i<n; i++){
2318 Sequence *seq = sequences[i];
2319 fseek( fin, seq->des_begin, SEEK_SET );
2320
2321 seg_size += seq->size;
2322 if( seg_size >= max_seg ){
2323 seg += 1;
2324 sprintf( outfile, "%s-%i", newdb, seg );
2325 fclose( fout );
2326 fout = fopen( outfile, "w+" );
2327 seg_size = seq->size;
2328 }
2329
2330 count = seq->tot_length / MAX_LINE_SIZE;
2331 rest = seq->tot_length % MAX_LINE_SIZE;
2332 //printf( "count = %6i, rest = %6i\n", count, rest );
2333 for (j=0; j<count; j++){
2334 if( fread( buf, 1, MAX_LINE_SIZE, fin ) ==0 ) bomb_error( "Can not swap in sequence" );
2335 fwrite( buf, 1, MAX_LINE_SIZE, fout );
2336 }
2337 if( rest ){
2338 if( fread( buf, 1, rest, fin ) ==0 ) bomb_error( "Can not swap in sequence" );
2339 fwrite( buf, 1, rest, fout );
2340 }
2341 }
2342 fclose( fin );
2343 fclose( fout );
2344 delete []buf;
2345 }
2346
2347 // input db is gzipped
WriteClustersgz(const char * db,const char * newdb,const Options & options)2348 void SequenceDB::WriteClustersgz( const char *db, const char *newdb, const Options & options )
2349 {
2350 #ifdef WITH_ZLIB
2351 gzFile fin = gzopen(db, "r");
2352 FILE *fout = fopen( newdb, "w+" );
2353 int i, j, n = rep_seqs.size();
2354 int count, rest;
2355 char *buf = new char[MAX_LINE_SIZE+1];
2356 vector<uint64_t> sorting( n );
2357 if( fin == NULL || fout == NULL ) bomb_error( "file opening failed" );
2358 for (i=0; i<n; i++) sorting[i] = ((uint64_t)sequences[ rep_seqs[i] ]->index << 32) | rep_seqs[i];
2359 std::sort( sorting.begin(), sorting.end() );
2360 for (i=0; i<n; i++){
2361 Sequence *seq = sequences[ sorting[i] & 0xffffffff ];
2362 gzseek( fin, seq->des_begin, SEEK_SET );
2363
2364 count = seq->tot_length / MAX_LINE_SIZE;
2365 rest = seq->tot_length % MAX_LINE_SIZE;
2366 //printf( "count = %6i, rest = %6i\n", count, rest );
2367 for (j=0; j<count; j++){
2368 if( gzread(fin, buf, MAX_LINE_SIZE) ==0 ) bomb_error( "Can not swap in sequence" );
2369 fwrite( buf, 1, MAX_LINE_SIZE, fout );
2370 }
2371 if( rest ){
2372 if( gzread(fin, buf, rest) ==0 ) bomb_error( "Can not swap in sequence" );
2373 fwrite( buf, 1, rest, fout );
2374 }
2375 }
2376 gzclose( fin );
2377 fclose( fout );
2378 delete []buf;
2379 #else
2380 bomb_error("this program was not compiled with zlib");
2381 #endif
2382
2383 }
2384
WriteClusters(const char * db,const char * newdb,const Options & options)2385 void SequenceDB::WriteClusters( const char *db, const char *newdb, const Options & options )
2386 {
2387 int f_len = strlen(db);
2388 if (strcmp(db + f_len - 3, ".gz") == 0 ) {
2389 WriteClustersgz(db, newdb, options);
2390 return;
2391 }
2392
2393 FILE *fin = fopen( db, "rb" );
2394 FILE *fout = fopen( newdb, "w+" );
2395 int i, j, n = rep_seqs.size();
2396 int count, rest;
2397 char *buf = new char[MAX_LINE_SIZE+1];
2398 vector<uint64_t> sorting( n );
2399 if( fin == NULL || fout == NULL ) bomb_error( "file opening failed" );
2400 for (i=0; i<n; i++) sorting[i] = ((uint64_t)sequences[ rep_seqs[i] ]->index << 32) | rep_seqs[i];
2401 std::sort( sorting.begin(), sorting.end() );
2402 for (i=0; i<n; i++){
2403 Sequence *seq = sequences[ sorting[i] & 0xffffffff ];
2404 fseek( fin, seq->des_begin, SEEK_SET );
2405
2406 count = seq->tot_length / MAX_LINE_SIZE;
2407 rest = seq->tot_length % MAX_LINE_SIZE;
2408 //printf( "count = %6i, rest = %6i\n", count, rest );
2409 for (j=0; j<count; j++){
2410 if( fread( buf, 1, MAX_LINE_SIZE, fin ) ==0 ) bomb_error( "Can not swap in sequence" );
2411 fwrite( buf, 1, MAX_LINE_SIZE, fout );
2412 }
2413 if( rest ){
2414 if( fread( buf, 1, rest, fin ) ==0 ) bomb_error( "Can not swap in sequence" );
2415 fwrite( buf, 1, rest, fout );
2416 }
2417 }
2418 fclose( fin );
2419 fclose( fout );
2420 delete []buf;
2421 }
2422
2423
2424 // input db is gzipped
2425 // liwz PE output
WriteClustersgz(const char * db,const char * db_pe,const char * newdb,const char * newdb_pe,const Options & options)2426 void SequenceDB::WriteClustersgz( const char *db, const char *db_pe, const char *newdb, const char *newdb_pe, const Options & options )
2427 {
2428 #ifdef WITH_ZLIB
2429 gzFile fin = gzopen(db, "r");
2430 gzFile fin_pe = gzopen(db_pe, "r");
2431 FILE *fout = fopen( newdb, "w+" );
2432 FILE *fout_pe = fopen( newdb_pe, "w+" );
2433 int i, j, n = rep_seqs.size();
2434 int count, rest;
2435 char *buf = new char[MAX_LINE_SIZE+1];
2436 vector<uint64_t> sorting( n );
2437 if( fin == NULL || fout == NULL ) bomb_error( "file opening failed" );
2438 if( fin_pe == NULL || fout_pe == NULL ) bomb_error( "file opening failed" );
2439 for (i=0; i<n; i++) sorting[i] = ((uint64_t)sequences[ rep_seqs[i] ]->index << 32) | rep_seqs[i];
2440 std::sort( sorting.begin(), sorting.end() );
2441
2442 //sort fasta / fastq
2443 int *clstr_size;
2444 int *clstr_idx1;
2445 if (options.sort_outputf) {
2446 clstr_size = new int[n];
2447 clstr_idx1 = new int[n];
2448 for (i=0; i<n; i++) {
2449 clstr_size[i] = 0;
2450 clstr_idx1[i] = i;
2451 }
2452
2453 int N = sequences.size();
2454 for (i=0; i<N; i++) {
2455 int id = sequences[i]->cluster_id;
2456 if (id < 0) continue;
2457 if (id >=n) continue;
2458 clstr_size[id]++;
2459 }
2460 quick_sort_idxr(clstr_size, clstr_idx1, 0, n-1);
2461 }
2462
2463 for (i=0; i<n; i++){
2464 Sequence *seq = sequences[ sorting[i] & 0xffffffff ];
2465 if (options.sort_outputf) seq = sequences[ rep_seqs[ clstr_idx1[i] ] ];
2466 //R1
2467 gzseek( fin, seq->des_begin, SEEK_SET );
2468
2469 count = seq->tot_length / MAX_LINE_SIZE;
2470 rest = seq->tot_length % MAX_LINE_SIZE;
2471 //printf( "count = %6i, rest = %6i\n", count, rest );
2472 for (j=0; j<count; j++){
2473 if( gzread(fin, buf, MAX_LINE_SIZE) ==0 ) bomb_error( "Can not swap in sequence" );
2474 fwrite( buf, 1, MAX_LINE_SIZE, fout );
2475 }
2476 if( rest ){
2477 if( gzread(fin, buf, rest) ==0 ) bomb_error( "Can not swap in sequence" );
2478 fwrite( buf, 1, rest, fout );
2479 }
2480
2481 //R2
2482 gzseek( fin_pe, seq->des_begin2, SEEK_SET );
2483
2484 count = seq->tot_length2 / MAX_LINE_SIZE;
2485 rest = seq->tot_length2 % MAX_LINE_SIZE;
2486 //printf( "count = %6i, rest = %6i\n", count, rest );
2487 for (j=0; j<count; j++){
2488 if( gzread(fin_pe, buf, MAX_LINE_SIZE) ==0 ) bomb_error( "Can not swap in sequence" );
2489 fwrite( buf, 1, MAX_LINE_SIZE, fout_pe );
2490 }
2491 if( rest ){
2492 if( gzread(fin_pe, buf, rest) ==0 ) bomb_error( "Can not swap in sequence" );
2493 fwrite( buf, 1, rest, fout_pe );
2494 }
2495
2496 }
2497 gzclose( fin );
2498 gzclose( fin_pe );
2499 fclose( fout );
2500 fclose( fout_pe );
2501 delete []buf;
2502 #else
2503 bomb_error("this program was not compiled with zlib");
2504 #endif
2505 }
2506
2507
2508 // liwz PE output
WriteClusters(const char * db,const char * db_pe,const char * newdb,const char * newdb_pe,const Options & options)2509 void SequenceDB::WriteClusters( const char *db, const char *db_pe, const char *newdb, const char *newdb_pe, const Options & options )
2510 {
2511 int f_len = strlen(db);
2512 if (strcmp(db + f_len - 3, ".gz") == 0 ) {
2513 WriteClustersgz(db, db_pe, newdb, newdb_pe, options);
2514 return;
2515 }
2516
2517 FILE *fin = fopen( db, "rb" );
2518 FILE *fout = fopen( newdb, "w+" );
2519 FILE *fin_pe = fopen( db_pe, "rb" );
2520 FILE *fout_pe = fopen( newdb_pe, "w+" );
2521 int i, j, n = rep_seqs.size();
2522 int count, rest;
2523 char *buf = new char[MAX_LINE_SIZE+1];
2524 vector<uint64_t> sorting( n );
2525 if( fin == NULL || fout == NULL ) bomb_error( "file opening failed" );
2526 if( fin_pe == NULL || fout_pe == NULL ) bomb_error( "file opening failed" );
2527 for (i=0; i<n; i++) sorting[i] = ((uint64_t)sequences[ rep_seqs[i] ]->index << 32) | rep_seqs[i];
2528 std::sort( sorting.begin(), sorting.end() );
2529
2530 //sort fasta / fastq
2531 int *clstr_size;
2532 int *clstr_idx1;
2533 if (options.sort_outputf) {
2534 clstr_size = new int[n];
2535 clstr_idx1 = new int[n];
2536 for (i=0; i<n; i++) {
2537 clstr_size[i] = 0;
2538 clstr_idx1[i] = i;
2539 }
2540
2541 int N = sequences.size();
2542 for (i=0; i<N; i++) {
2543 int id = sequences[i]->cluster_id;
2544 if (id < 0) continue;
2545 if (id >=n) continue;
2546 clstr_size[id]++;
2547 }
2548 quick_sort_idxr(clstr_size, clstr_idx1, 0, n-1);
2549 }
2550
2551 for (i=0; i<n; i++){
2552 Sequence *seq = sequences[ sorting[i] & 0xffffffff ];
2553 if (options.sort_outputf) seq = sequences[ rep_seqs[ clstr_idx1[i] ] ];
2554 //R1
2555 fseek( fin, seq->des_begin, SEEK_SET );
2556
2557 count = seq->tot_length / MAX_LINE_SIZE;
2558 rest = seq->tot_length % MAX_LINE_SIZE;
2559 //printf( "count = %6i, rest = %6i\n", count, rest );
2560 for (j=0; j<count; j++){
2561 if( fread( buf, 1, MAX_LINE_SIZE, fin ) ==0 ) bomb_error( "Can not swap in sequence" );
2562 fwrite( buf, 1, MAX_LINE_SIZE, fout );
2563 }
2564 if( rest ){
2565 if( fread( buf, 1, rest, fin ) ==0 ) bomb_error( "Can not swap in sequence" );
2566 fwrite( buf, 1, rest, fout );
2567 }
2568
2569 //R2
2570 fseek( fin_pe, seq->des_begin2, SEEK_SET );
2571
2572 count = seq->tot_length2 / MAX_LINE_SIZE;
2573 rest = seq->tot_length2 % MAX_LINE_SIZE;
2574 //printf( "count = %6i, rest = %6i\n", count, rest );
2575 for (j=0; j<count; j++){
2576 if( fread( buf, 1, MAX_LINE_SIZE, fin_pe ) ==0 ) bomb_error( "Can not swap in sequence" );
2577 fwrite( buf, 1, MAX_LINE_SIZE, fout_pe );
2578 }
2579 if( rest ){
2580 if( fread( buf, 1, rest, fin_pe ) ==0 ) bomb_error( "Can not swap in sequence" );
2581 fwrite( buf, 1, rest, fout_pe );
2582 }
2583
2584 }
2585 fclose( fin );
2586 fclose( fout );
2587 fclose( fin_pe );
2588 fclose( fout_pe );
2589 delete []buf;
2590 }
2591
WriteExtra1D(const Options & options)2592 void SequenceDB::WriteExtra1D( const Options & options )
2593 {
2594 string db_clstr = options.output + ".clstr";
2595 string db_clstr_bak = options.output + ".bak.clstr";
2596 int i, i0, k, N = sequences.size();
2597 vector<long long> sorting( N );
2598 for (i=0; i<N; i++) sorting[i] = ((long long)sequences[i]->index << 32) | i;
2599 std::sort( sorting.begin(), sorting.end() );
2600
2601 FILE *fout;
2602 char *buf = new char[ MAX_DES + 1 ];
2603
2604 if( options.backupFile ){
2605 fout = fopen( db_clstr_bak.c_str(), "w+" );
2606 for (i=0; i<N; i++) {
2607 Sequence *seq = sequences[ sorting[i] & 0xffffffff ];
2608 seq->PrintInfo( seq->cluster_id, fout, options, buf );
2609 }
2610 fclose( fout );
2611 }
2612
2613 cout << "writing clustering information" << endl;
2614 int M = rep_seqs.size();
2615 Vector<Vector<int> > clusters( M );
2616 for (i=0; i<N; i++){
2617 int k = sorting[i] & 0xffffffff;
2618 int id = sequences[k]->cluster_id;
2619 clusters[id].Append( k );
2620 }
2621
2622 fout = fopen( db_clstr.c_str(), "w+" );
2623
2624 if (options.sort_output) {
2625 int *clstr_size = new int[M];
2626 int *clstr_idx1 = new int[M];
2627
2628 for (i=0; i<M; i++) {
2629 clstr_size[i] = (int)clusters[i].size();
2630 clstr_idx1[i] = i;
2631 }
2632 quick_sort_idxr(clstr_size, clstr_idx1, 0, M-1);
2633
2634 for (i=0; i<M; i++) {
2635 i0 = clstr_idx1[i];
2636 fprintf( fout, ">Cluster %i\n", i );
2637 for (k=0; k<(int)clusters[i0].size(); k++)
2638 sequences[ clusters[i0][k] ]->PrintInfo( k, fout, options, buf );
2639 }
2640 }
2641 else {
2642 for (i=0; i<M; i++) {
2643 fprintf( fout, ">Cluster %i\n", i );
2644 for (k=0; k<(int)clusters[i].size(); k++)
2645 sequences[ clusters[i][k] ]->PrintInfo( k, fout, options, buf );
2646 }
2647
2648 }
2649
2650 delete []buf;
2651 }
WriteExtra2D(SequenceDB & other,const Options & options)2652 void SequenceDB::WriteExtra2D( SequenceDB & other, const Options & options )
2653 {
2654 string db_clstr = options.output + ".clstr";
2655 string db_clstr_bak = options.output + ".bak.clstr";
2656 int i, k, N = other.sequences.size();
2657 int N2 = sequences.size();
2658 vector<long long> sorting( N );
2659 for (i=0; i<N; i++) sorting[i] = ((long long)other.sequences[i]->index << 32) | i;
2660 std::sort( sorting.begin(), sorting.end() );
2661
2662 FILE *fout;
2663 char *buf = new char[ MAX_DES + 1 ];
2664 if( options.backupFile ){
2665 fout = fopen( db_clstr_bak.c_str(), "w+" );
2666 for (i=0; i<N; i++) {
2667 Sequence *seq = other.sequences[ sorting[i] & 0xffffffff ];
2668 seq->PrintInfo( seq->cluster_id, fout, options, buf );
2669 }
2670 for (i=0; i<N2; i++) {
2671 Sequence *seq = sequences[i];
2672 if( seq->state & IS_REDUNDANT ) seq->PrintInfo( seq->cluster_id, fout, options, buf );
2673 }
2674 fclose( fout );
2675 }
2676
2677 cout << "writing clustering information" << endl;
2678 Vector<Vector<int> > clusters( N );
2679 for (i=0; i<N2; i++){
2680 int id = sequences[i]->cluster_id;
2681 if( sequences[i]->state & IS_REDUNDANT ) clusters[id].Append( i );
2682 }
2683
2684 fout = fopen( db_clstr.c_str(), "w+" );
2685 for (i=0; i<N; i++) {
2686 Sequence *seq = other.sequences[ i ];
2687 fprintf( fout, ">Cluster %i\n", i );
2688 seq->PrintInfo( 0, fout, options, buf );
2689 for (k=0; k<(int)clusters[i].size(); k++)
2690 sequences[ clusters[i][k] ]->PrintInfo( k+1, fout, options, buf );
2691 }
2692 delete []buf;
2693 }
ControlShortCoverage(int len,const Options & options)2694 void WorkingParam::ControlShortCoverage( int len, const Options & options )
2695 {
2696 len_eff = len;
2697 aln_cover_flag = 0;
2698 if ((options.short_coverage > 0.0) || (options.min_control>0) ) { // has alignment coverage control
2699 aln_cover_flag = 1;
2700 min_aln_lenS = (int) (double(len) * options.short_coverage);
2701 if ( len-options.short_control > min_aln_lenS) min_aln_lenS = len-options.short_control;
2702 if ( options.min_control > min_aln_lenS) min_aln_lenS = options.min_control;
2703 }
2704 if (options.global_identity == 0) len_eff = min_aln_lenS; //global_identity==0
2705 }
ControlLongCoverage(int len2,const Options & options)2706 void WorkingParam::ControlLongCoverage( int len2, const Options & options )
2707 {
2708 if (aln_cover_flag) {
2709 min_aln_lenL = (int) (double(len2) * options.long_coverage);
2710 if ( len2-options.long_control > min_aln_lenL) min_aln_lenL = len2-options.long_control;
2711 if ( options.min_control > min_aln_lenL) min_aln_lenL = options.min_control;
2712 }
2713 }
2714
2715
2716 // when alignment coverage such as -aL is specified
2717 // if a existing rep is too long, it won't be qulified
upper_bound_length_rep(int len,double opt_s,int opt_S,double opt_aL,int opt_AL)2718 int upper_bound_length_rep(int len, double opt_s, int opt_S, double opt_aL, int opt_AL )
2719 {
2720 int len_upper_bound = 99999999;
2721 double r1 = (opt_s > opt_aL) ? opt_s : opt_aL;
2722 int a2 = (opt_S < opt_AL) ? opt_S : opt_AL;
2723 if (r1 > 0.0) len_upper_bound = (int) ( ((float) len) / r1);
2724 if ((len+a2) < len_upper_bound) len_upper_bound = len+a2;
2725
2726 return len_upper_bound;
2727 } // END upper_bound_length_rep
upper_bound_length_rep(int len,const Options & options)2728 int upper_bound_length_rep(int len, const Options & options )
2729 {
2730 double opt_s = options.diff_cutoff;
2731 int opt_S = options.diff_cutoff_aa;
2732 double opt_aL = options.long_coverage;
2733 int opt_AL = options.long_control;
2734 return upper_bound_length_rep( len, opt_s, opt_S, opt_aL, opt_AL );
2735 }
2736
2737
cal_aax_cutoff(double & aa1_cutoff,double & aa2_cutoff,double & aan_cutoff,double cluster_thd,int tolerance,int naa_stat_start_percent,int naa_stat[5][61][4],int NAA)2738 void cal_aax_cutoff(double &aa1_cutoff, double &aa2_cutoff, double &aan_cutoff,
2739 double cluster_thd, int tolerance, int naa_stat_start_percent,
2740 int naa_stat[5][61][4], int NAA)
2741 {
2742 aa1_cutoff = cluster_thd;
2743 aa2_cutoff = 1 - (1-cluster_thd)*2;
2744 aan_cutoff = 1 - (1-cluster_thd)*NAA;
2745 if (tolerance==0) return;
2746
2747 int clstr_idx = (int) (cluster_thd * 100) - naa_stat_start_percent;
2748 if (clstr_idx <0) clstr_idx = 0;
2749 double d2 = ((double) (naa_stat[tolerance-1][clstr_idx][3] )) / 100;
2750 double dn = ((double) (naa_stat[tolerance-1][clstr_idx][5-NAA] )) / 100;
2751 aa2_cutoff = d2 > aa2_cutoff ? d2 : aa2_cutoff;
2752 aan_cutoff = dn > aan_cutoff ? dn : aan_cutoff;
2753 return;
2754 } // END cal_aax_cutoff
2755
2756
update_aax_cutoff(double & aa1_cutoff,double & aa2_cutoff,double & aan_cutoff,int tolerance,int naa_stat_start_percent,int naa_stat[5][61][4],int NAA,double cluster_thd)2757 void update_aax_cutoff(double &aa1_cutoff, double &aa2_cutoff, double &aan_cutoff,
2758 int tolerance, int naa_stat_start_percent,
2759 int naa_stat[5][61][4], int NAA, double cluster_thd)
2760 {
2761 if (cluster_thd > 1.0) cluster_thd = 1.00;
2762
2763 double aa1_t, aa2_t, aan_t;
2764 cal_aax_cutoff(aa1_t, aa2_t, aan_t, cluster_thd, tolerance, naa_stat_start_percent,
2765 naa_stat, NAA);
2766 if (aa1_t > aa1_cutoff) aa1_cutoff = aa1_t;
2767 if (aa2_t > aa2_cutoff) aa2_cutoff = aa2_t;
2768 if (aan_t > aan_cutoff) aan_cutoff = aan_t;
2769 return;
2770 } // END update_aax_cutoff
2771
ComputeRequiredBases(int NAA,int ss,const Options & option)2772 void WorkingParam::ComputeRequiredBases( int NAA, int ss, const Options & option )
2773 {
2774 // d: distance, fraction of errors;
2775 // e: number of errors;
2776 // g: length of the maximum gap;
2777 // m: word length;
2778 // n: sequence length;
2779 // alignment length = n - g + 1;
2780 // d = e / (n - g + 1);
2781 // e >= 1, so that, g <= n + 1 - 1/d
2782 // word count = (n - g - m + 1) - (e - 1)*m;
2783 // = (n - g - m + 1) - (d*(n - g + 1) - 1)*m
2784 // = (n - g + 1) - d*m*(n - g + 1)
2785 // = (n - g + 1)*(1 - d*m)
2786 // minimum word count is reached when g == n + 1 - 1/d
2787 // so, minimum word count = 1/d - m.
2788 // if g == band_width: word count = (n - band + 1)*(1 - d*m);
2789 if( options.useDistance ){
2790 int band = options.band_width + 1;
2791 int invd = int( 1.0 / (options.distance_thd + 1E-9) );
2792 int k = len_eff < invd ? len_eff : invd;
2793 int ks = len_eff - ss + 1;
2794 int kn = len_eff - NAA + 1;
2795 int ks2 = invd - ss;
2796 int kn2= invd - NAA;
2797 int ks3 = int((len_eff - band + 1.0)*(1.0 - options.distance_thd * ss));
2798 int kn3 = int((len_eff - band + 1.0)*(1.0 - options.distance_thd * NAA));
2799 //if( ks3 > ks2 ) ks2 = ks3;
2800 //if( kn3 > kn2 ) kn2 = kn3;
2801 required_aa1 = required_aas = (ks2 < ks ? ks2 : ks);
2802 required_aan = kn2 < kn ? kn2 : kn;
2803 if( required_aa1 <=0 ) required_aa1 = required_aas = 1;
2804 if( required_aan <=0 ) required_aan = 1;
2805 //required_aa1 = required_aas = required_aan = 0;
2806 return;
2807 }
2808 // (N-K)-K*(1-C)*N = C*K*N-(K-1)*N-K = (C*K-K+1)*N-K
2809 required_aa1 = (len_eff - ss) - int(ss * ceil( (1.0 - aa1_cutoff) * len_eff ));
2810 if( required_aa1 < 0 ) required_aa1 = 0;
2811 required_aas = required_aa1;
2812 required_aan = (len_eff - NAA) - int(NAA * ceil( (1.0 - aa1_cutoff) * len_eff ));
2813 //printf( "%i %i\n", required_aa1, required_aan );
2814 if( required_aan < 0 ) required_aan = 0;
2815
2816 int aa1_old = int (aa1_cutoff* (double) len_eff) - ss + 1;
2817 int aas_old = int (aas_cutoff* (double) len_eff);
2818 int aan_old = int (aan_cutoff* (double) len_eff);
2819
2820 double thd = option.cluster_thd;
2821 //double rest = (len_eff - ss) / double(len_eff * ss);
2822 double rest = (len_eff - NAA) / double(len_eff * NAA);
2823 double thd0 = 1.0 - rest;
2824 double fnew = 0;
2825 double fold = 1;
2826 if( thd > thd0 ){
2827 fnew = (thd - thd0) / rest;
2828 fold = 1.0 - fnew;
2829 }
2830 //printf( "%g %g %g\n", thd, thd0, fnew );
2831
2832 required_aa1 = (int)(fnew*required_aa1 + fold*aa1_old);
2833 required_aas = (int)(fnew*required_aas + fold*aas_old);
2834 required_aan = (int)(fnew*required_aan + fold*aan_old);
2835 }
EncodeWords(Sequence * seq,int NAA,bool est)2836 int WorkingBuffer::EncodeWords( Sequence *seq, int NAA, bool est )
2837 {
2838 char *seqi = seq->data;
2839 int len = seq->size;
2840 // check_word_encodes
2841 int aan_no = len - NAA + 1;
2842 int i, j, i0, i1;
2843 int skip = 0;
2844 unsigned char k, k1;
2845 for (j=0; j<aan_no; j++) {
2846 char *word = seqi + j;
2847 int encode = 0;
2848 for (k=0, k1=NAA-1; k<NAA; k++, k1--) encode += word[k] * NAAN_array[k1];
2849 word_encodes[j] = word_encodes_backup[j] = encode;
2850 }
2851
2852 if( est ){
2853 for (j=0; j<len; j++){
2854 if ( seqi[j] >= 4 ) { // here N is 4
2855 i0 = (j-NAA+1 > 0) ? j-NAA+1 : 0;
2856 i1 = j < aan_no ? j : aan_no - 1;
2857 for (i=i0; i<=i1; i++) word_encodes[i]=-1;
2858 }
2859 }
2860 for (j=0; j<aan_no; j++) skip += (word_encodes[j] == -1);
2861 }
2862
2863 std::sort( word_encodes.begin(), word_encodes.begin() + aan_no );
2864 for(j=0; j<aan_no; j++) word_encodes_no[j]=1;
2865 for(j=aan_no-1; j; j--) {
2866 if (word_encodes[j] == word_encodes[j-1]) {
2867 word_encodes_no[j-1] += word_encodes_no[j];
2868 word_encodes_no[j]=0;
2869 }
2870 }
2871 return skip;
2872 // END check_word_encodes
2873 }
ComputeAAP(const char * seqi,int size)2874 void WorkingBuffer::ComputeAAP( const char *seqi, int size )
2875 {
2876 int len1 = size - 1;
2877 int sk, j1, mm, c22;
2878 for (sk=0; sk<NAA2; sk++) taap[sk] = 0;
2879 for (j1=0; j1<len1; j1++) {
2880 c22= seqi[j1]*NAA1 + seqi[j1+1];
2881 taap[c22]++;
2882 }
2883 for (sk=0,mm=0; sk<NAA2; sk++) {
2884 aap_begin[sk] = mm; mm+=taap[sk]; taap[sk] = 0;
2885 }
2886 for (j1=0; j1<len1; j1++) {
2887 c22= seqi[j1]*NAA1 + seqi[j1+1];
2888 aap_list[aap_begin[c22]+taap[c22]++] =j1;
2889 }
2890 }
ComputeAAP2(const char * seqi,int size)2891 void WorkingBuffer::ComputeAAP2( const char *seqi, int size )
2892 {
2893 int len1 = size - 3;
2894 int sk, j1, mm, c22;
2895 for (sk=0; sk<NAA4; sk++) taap[sk] = 0;
2896 for (j1=0; j1<len1; j1++) {
2897 if ((seqi[j1]>=4) || (seqi[j1+1]>=4) || (seqi[j1+2]>=4) || (seqi[j1+3]>=4)) continue; //skip N
2898 c22 = seqi[j1]*NAA3 + seqi[j1+1]*NAA2 + seqi[j1+2]*NAA1 + seqi[j1+3];
2899 taap[c22]++;
2900 }
2901 for (sk=0,mm=0; sk<NAA4; sk++) {
2902 aap_begin[sk] = mm; mm += taap[sk]; taap[sk] = 0;
2903 }
2904 for (j1=0; j1<len1; j1++) {
2905 if ((seqi[j1]>=4) || (seqi[j1+1]>=4) || (seqi[j1+2]>=4) || (seqi[j1+3]>=4)) continue; //skip N
2906 c22 = seqi[j1]*NAA3 + seqi[j1+1]*NAA2 + seqi[j1+2]*NAA1 + seqi[j1+3];
2907 aap_list[aap_begin[c22]+taap[c22]++] =j1;
2908 }
2909 }
2910
ClusterOne(Sequence * seq,int id,WordTable & table,WorkingParam & param,WorkingBuffer & buffer,const Options & options)2911 void SequenceDB::ClusterOne( Sequence *seq, int id, WordTable & table,
2912 WorkingParam & param, WorkingBuffer & buffer, const Options & options )
2913 {
2914 if (seq->state & IS_REDUNDANT) return;
2915 int frag_size = options.frag_size;
2916 int NAA = options.NAA;
2917 int len = seq->size;
2918 int len_bound = upper_bound_length_rep(len, options);
2919 param.len_upper_bound = len_bound;
2920 int flag = CheckOne( seq, table, param, buffer, options );
2921
2922 if( flag == 0 ){
2923 if ((seq->identity>0) && (options.cluster_best)) {
2924 // because of the -g option, this seq is similar to seqs in old SEGs
2925 seq->state |= IS_REDUNDANT ;
2926 seq->Clear();
2927 } else { // else add to NR90 db
2928 int aan_no = len - NAA + 1;
2929 int size = rep_seqs.size();
2930 rep_seqs.Append( id );
2931 seq->cluster_id = size;
2932 seq->identity = 0;
2933 seq->state |= IS_REP;
2934 if (frag_size){ /* not used for EST */
2935 int frg1 = (len - NAA ) / frag_size + 1;
2936 table.AddWordCountsFrag( aan_no, buffer.word_encodes_backup,
2937 buffer.word_encodes_no, frg1, frag_size );
2938 }else{
2939 table.AddWordCounts(aan_no, buffer.word_encodes, buffer.word_encodes_no, table.sequences.size(), options.isEST);
2940 }
2941 table.sequences.Append( seq );
2942 if( frag_size ){
2943 while( table.sequences.size() < table.frag_count )
2944 table.sequences.Append( seq );
2945 }
2946 }
2947 }
2948 if ( (id+1) % 1000 == 0 ) {
2949 int size = rep_seqs.size();
2950 printf( "." );
2951 fflush( stdout );
2952 if ( (id+1) % 10000 == 0 ) printf( "\r..........%9i finished %9i clusters\n", id+1, size );
2953 }
2954 }
2955 #include<assert.h>
MinimalMemory(int frag_no,int bsize,int T,const Options & options,size_t extra)2956 size_t SequenceDB::MinimalMemory( int frag_no, int bsize, int T, const Options & options, size_t extra )
2957 {
2958 int N = sequences.size();
2959 int F = frag_no < MAX_TABLE_SEQ ? frag_no : MAX_TABLE_SEQ;
2960 size_t mem_need = 0;
2961 size_t mem, mega = 1000000;
2962 int table = T > 1 ? 2 : 1;
2963
2964 printf( "\nApproximated minimal memory consumption:\n" );
2965 mem = N*sizeof(Sequence) + total_desc + N + extra;
2966 if( options.store_disk == false ) mem += total_letter + N;
2967 printf( "%-16s: %zuM\n", "Sequence", mem/mega );
2968 mem_need += mem;
2969
2970 mem = bsize;
2971 printf( "%-16s: %i X %zuM = %zuM\n", "Buffer", T, mem/mega, T*mem/mega );
2972 mem_need += T*mem;
2973
2974 mem = F*(sizeof(Sequence*) + sizeof(IndexCount)) + NAAN*sizeof(NVector<IndexCount>);
2975 printf( "%-16s: %i X %zuM = %zuM\n", "Table", table, mem/mega, table*mem/mega );
2976 mem_need += table*mem;
2977
2978 mem = sequences.capacity()*sizeof(Sequence*) + N*sizeof(int);
2979 mem += Comp_AAN_idx.size()*sizeof(int);
2980 printf( "%-16s: %zuM\n", "Miscellaneous", mem/mega );
2981 mem_need += mem;
2982
2983 printf( "%-16s: %zuM\n\n", "Total", mem_need/mega );
2984
2985 if(options.max_memory and options.max_memory < mem_need + 50*table ){
2986 char msg[200];
2987 sprintf( msg, "not enough memory, please set -M option greater than %zu\n",
2988 50*table + mem_need/mega );
2989 bomb_error(msg);
2990 }
2991 return mem_need;
2992 }
MemoryLimit(size_t mem_need,const Options & options)2993 size_t MemoryLimit( size_t mem_need, const Options & options )
2994 {
2995 size_t mem_limit = (options.max_memory - mem_need) / sizeof(IndexCount);
2996
2997 //printf( "Table limit with the given memory limit:\n" );
2998 if( options.max_memory == 0 ){
2999 mem_limit = options.max_entries;
3000 if( mem_limit > MAX_TABLE_SIZE ) mem_limit = MAX_TABLE_SIZE;
3001 }
3002 //printf( "Max number of representatives: %zu\n", mem_limit );
3003 //printf( "Max number of word counting entries: %zu\n\n", mem_limit );
3004 return mem_limit;
3005 }
ComputeTableLimits(int min_len,int max_len,int typical_len,size_t mem_need)3006 void Options::ComputeTableLimits( int min_len, int max_len, int typical_len, size_t mem_need )
3007 {
3008 //liwz Fri Jan 15 15:44:47 PST 2016
3009 //T=1 scale=1
3010 //T=2 scale=0.6035
3011 //T=4 scale=0.375
3012 //T=8 scale=0.2392
3013 //T=16 scale=0.1562
3014 //T=32 scale=0.104
3015 //T=64 scale=0.0703
3016
3017 double scale = 0.5/threads + 0.5/sqrt(threads);
3018 max_sequences = (size_t)(scale * MAX_TABLE_SEQ);
3019 max_entries = (size_t)(scale * (500*max_len + 500000*typical_len + 50000000));
3020 if( max_memory ){
3021 double frac = max_sequences / (double) max_entries;
3022 max_entries = (options.max_memory - mem_need) / sizeof(IndexCount);
3023 max_sequences = (size_t)(max_entries * frac);
3024 if( max_sequences < MAX_TABLE_SEQ / 100 ) max_sequences = MAX_TABLE_SEQ / 100;
3025 if( max_sequences > MAX_TABLE_SEQ ) max_sequences = MAX_TABLE_SEQ;
3026 }
3027 printf( "Table limit with the given memory limit:\n" );
3028 printf( "Max number of representatives: %zu\n", max_sequences );
3029 printf( "Max number of word counting entries: %zu\n\n", max_entries );
3030 }
DoClustering(int T,const Options & options)3031 void SequenceDB::DoClustering( int T, const Options & options )
3032 {
3033 int i, j, k;
3034 int NAA = options.NAA;
3035 double aa1_cutoff = options.cluster_thd;
3036 double aas_cutoff = 1 - (1-options.cluster_thd)*4;
3037 double aan_cutoff = 1 - (1-options.cluster_thd)*options.NAA;
3038 int seq_no = sequences.size();
3039 int frag_no = seq_no;
3040 int frag_size = options.frag_size;
3041 int len, len_bound;
3042 int flag;
3043 valarray<size_t> letters(T);
3044
3045 //printf( "%li\n", options.mem_limit );
3046
3047 if (frag_size){
3048 frag_no = 0;
3049 for (i=0; i<seq_no; i++) frag_no += (sequences[i]->size - NAA) / frag_size + 1;
3050 }
3051
3052 if( not options.isEST )
3053 cal_aax_cutoff(aa1_cutoff, aas_cutoff, aan_cutoff, options.cluster_thd,
3054 options.tolerance, naa_stat_start_percent, naa_stat, NAA);
3055
3056 Vector<WorkingParam> params(T);
3057 Vector<WorkingBuffer> buffers(T);
3058 for(i=0; i<T; i++){
3059 params[i].Set( aa1_cutoff, aas_cutoff, aan_cutoff );
3060 buffers[i].Set( frag_no, max_len, options );
3061 }
3062
3063 // word_table as self comparing table and table buffer:
3064 WordTable word_table( options.NAA, NAAN );
3065
3066 WordTable last_table( options.NAA, NAAN );
3067
3068 int N = sequences.size();
3069 int K = N - 100 * T;
3070 size_t mem_need = MinimalMemory( frag_no, buffers[0].total_bytes, T, options );
3071 size_t mem_limit = MemoryLimit( mem_need, options );
3072 size_t mem, mega = 1000000;
3073 size_t tabsize = 0;
3074 int remaining = 0;
3075
3076 Options opts( options );
3077 opts.ComputeTableLimits( min_len, max_len, len_n50, mem_need );
3078
3079 omp_set_num_threads(T);
3080 for(i=0; i<N; ){
3081 int start = i;
3082 int m = i;
3083 size_t sum = remaining;
3084 float redundancy = (rep_seqs.size() + 1.0) / (i + 1.0);
3085 size_t max_items = opts.max_entries;
3086 size_t max_seqs = opts.max_sequences;
3087 size_t items = 0;
3088 if( i == 0 && max_seqs > 1000 ){ // first SCB with small size
3089 max_items /= 8;
3090 max_seqs /= 8;
3091 }
3092 while( m < N && (sum*redundancy) < max_seqs && items < max_items ){
3093 Sequence *seq = sequences[m];
3094 if( ! (seq->state & IS_REDUNDANT) ){
3095 if ( options.store_disk ) seq->SwapIn();
3096 //items += seq->size;
3097 items += (size_t)(seq->size * redundancy);
3098 sum += 1;
3099 }
3100 m ++;
3101 }
3102 if( (m > i + 1E4) && (m > i + (N - i) / (2+T)) ) m = i + (N - i) / (2+T);
3103 if( m == i || m >= N ){
3104 m = N;
3105 if( m > i + 1E3 ) m = i + (N - i) / (2+T);
3106 }
3107 //printf( "m = %i %i, %i\n", i, m, m-i );
3108 printf( "\r# comparing sequences from %9i to %9i\n", i, m );
3109 if( last_table.size ){
3110 int print = (m-i)/20 + 1;
3111 #pragma omp parallel for schedule( dynamic, 1 )
3112 for(int j=i; j<m; j++){
3113 Sequence *seq = sequences[j];
3114 if (seq->state & IS_REDUNDANT) continue;
3115 int tid = omp_get_thread_num();
3116 CheckOne( seq, last_table, params[tid], buffers[tid], options );
3117 if ( options.store_disk && (seq->state & IS_REDUNDANT) ) seq->SwapOut();
3118 if( j%print==0 ){
3119 printf( "." ); fflush( stdout );
3120 }
3121 }
3122 int may_stop = 0;
3123 int self_stop = 0;
3124 float p0 = 0;
3125 int min = last_table.sequences[ last_table.sequences.size()-1 ]->size;
3126 int m0 = m;
3127 bool stop = false;
3128 #pragma omp parallel for schedule( dynamic, 1 )
3129 for(int j=m-1; j<N; j++){
3130 #pragma omp flush (stop)
3131 if( ! stop ){
3132 if( j+1 == N ) may_stop = 1;
3133 if( j == (m0-1) ){ // use m0 to avoid other iterations satisfying the condition:
3134 int tid = omp_get_thread_num();
3135 for(int ks=i; ks<m; ks++){
3136 Sequence *seq = sequences[ks];
3137 i = ks + 1;
3138 if (seq->state & IS_REDUNDANT) continue;
3139 ClusterOne( seq, ks, word_table, params[tid], buffers[tid], options );
3140 if ( options.store_disk && (seq->state & IS_REDUNDANT) ) seq->SwapOut();
3141 if( may_stop and word_table.sequences.size() >= 100 ) break;
3142 if( word_table.size >= max_items ) break;
3143 int tmax = max_seqs - (frag_size ? seq->size / frag_size + 1 : 0);
3144 if( word_table.sequences.size() >= tmax ) break;
3145 }
3146 self_stop = 1;
3147 }else{
3148 Sequence *seq = sequences[j];
3149 if (seq->state & IS_REDUNDANT) continue;
3150 if ( options.store_disk ){
3151 #pragma omp critical
3152 seq->SwapIn();
3153 }
3154 int tid = omp_get_thread_num();
3155 CheckOne( seq, last_table, params[tid], buffers[tid], options );
3156 if ( options.store_disk && (seq->state & IS_REDUNDANT) ) seq->SwapOut();
3157 if( min > params[tid].len_upper_bound ){
3158 may_stop = 1;
3159 stop = true;
3160 #pragma omp flush (stop)
3161 }
3162 if( self_stop && tid ==1 ){
3163 float p = (100.0*j)/N;
3164 if( p > p0+1E-1 ){ // print only if the percentage changed
3165 printf( "\r%4.1f%%", p );
3166 fflush( stdout );
3167 p0 = p;
3168 }
3169 }
3170
3171 }
3172 }
3173 }
3174 }
3175 if( i == start || m == N ){
3176 //printf( "comparing the first or last or very small group ...\n" ); fflush( stdout );
3177 for(k=i; k<m; ){
3178 int kk, mm = k, sum = 0;
3179 while( mm < m && sum < 1E5 ){
3180 if( ! (sequences[mm]->state & IS_REDUNDANT) ) sum += sequences[mm]->size;
3181 mm += 1;
3182 }
3183 if( mm < k + 1000 ) mm = k + 1000;
3184 if( mm > m ) mm = m;
3185 #pragma omp parallel for schedule( dynamic, 1 )
3186 for(kk=k; kk<mm; kk++){
3187 Sequence *seq = sequences[kk];
3188 if (seq->state & IS_REDUNDANT) continue;
3189 int tid = omp_get_thread_num();
3190 CheckOne( seq, word_table, params[tid], buffers[tid], options );
3191 if ( options.store_disk && (seq->state & IS_REDUNDANT) ) seq->SwapOut();
3192 }
3193 bool bk = false;
3194 for(int ks=k; ks<mm; ks++){
3195 Sequence *seq = sequences[ks];
3196 i = k = ks + 1;
3197 if (seq->state & IS_REDUNDANT) continue;
3198 ClusterOne( seq, ks, word_table, params[0], buffers[0], options );
3199 bk = true;
3200 if ( options.store_disk && (seq->state & IS_REDUNDANT) ) seq->SwapOut();
3201 if( word_table.size >= max_items ) break;
3202 int tmax = max_seqs - (frag_size ? seq->size / frag_size + 1 : 0);
3203 if( word_table.sequences.size() >= tmax ) break;
3204 bk = false;
3205 }
3206 if( bk ) break;
3207 }
3208 }else if( i < m ){
3209 remaining = remaining/2 + (m - i);
3210 printf( "\r---------- %6i remaining sequences to the next cycle\n", m-i );
3211 }
3212 printf( "---------- new table with %8i representatives\n", word_table.sequences.size() );
3213 if( (last_table.size + word_table.size) > tabsize )
3214 tabsize = last_table.size + word_table.size;
3215 last_table.Clear();
3216 last_table.sequences.swap( word_table.sequences );
3217 last_table.indexCounts.swap( word_table.indexCounts );
3218 last_table.size = word_table.size;
3219 word_table.size = 0;
3220 }
3221 printf( "\n%9i finished %9i clusters\n", sequences.size(), rep_seqs.size() );
3222 mem = (mem_need + tabsize*sizeof(IndexCount))/mega;
3223 printf( "\nApproximated maximum memory consumption: %zuM\n", mem );
3224 last_table.Clear();
3225 word_table.Clear();
3226 }
3227
CheckOne(Sequence * seq,WordTable & table,WorkingParam & param,WorkingBuffer & buf,const Options & options)3228 int SequenceDB::CheckOne( Sequence *seq, WordTable & table, WorkingParam & param, WorkingBuffer & buf, const Options & options )
3229 {
3230 int len = seq->size;
3231 param.len_upper_bound = upper_bound_length_rep(len, options);
3232 if( options.isEST ) return CheckOneEST( seq, table, param, buf, options );
3233 return CheckOneAA( seq, table, param, buf, options );
3234 }
CheckOneAA(Sequence * seq,WordTable & table,WorkingParam & param,WorkingBuffer & buf,const Options & options)3235 int SequenceDB::CheckOneAA( Sequence *seq, WordTable & table, WorkingParam & param, WorkingBuffer & buf, const Options & options )
3236 {
3237 NVector<IndexCount> & lookCounts = buf.lookCounts;
3238 NVector<uint32_t> & indexMapping = buf.indexMapping;
3239 Vector<INTs> & word_encodes_no = buf.word_encodes_no;
3240 Vector<INTs> & aap_list = buf.aap_list;
3241 Vector<INTs> & aap_begin = buf.aap_begin;
3242 Vector<int> & word_encodes = buf.word_encodes;
3243 Vector<int> & taap = buf.taap;
3244 double aa1_cutoff = param.aa1_cutoff;
3245 double aa2_cutoff = param.aas_cutoff;
3246 double aan_cutoff = param.aan_cutoff;
3247
3248 char *seqi = seq->data;
3249 int j, k, j1, len = seq->size;
3250 int flag = 0;
3251 int frag_size = options.frag_size;
3252 int & aln_cover_flag = param.aln_cover_flag;
3253 int & required_aa1 = param.required_aa1;
3254 int & required_aa2 = param.required_aas;
3255 int & required_aan = param.required_aan;
3256 int & min_aln_lenS = param.min_aln_lenS;
3257 int & min_aln_lenL = param.min_aln_lenL;
3258
3259 int NAA = options.NAA;
3260 int S = table.sequences.size();
3261 int len_eff = len;
3262
3263 if( S ){
3264 int min = table.sequences[S-1]->size;
3265 if( min < len ){
3266 if( len * options.diff_cutoff2 > min ) min = (int)(len * options.diff_cutoff2);
3267 if( (len - options.diff_cutoff_aa2) > min ) min = len - options.diff_cutoff_aa2;
3268 len_eff = min;
3269 }
3270 }
3271
3272 //liwz 2016 01, seq is too short for the shortest (longer) seq in word_table to satisfy -aL option
3273 //longer seqeunce * -aL -band_width
3274 if ( S ) {
3275 int min = table.sequences[S-1]->size;
3276 int min_red = min * options.long_coverage - options.band_width;
3277 if (len < min_red) return 0; // return flag=0
3278 }
3279
3280 param.ControlShortCoverage( len_eff, options );
3281 param.ComputeRequiredBases( options.NAA, 2, options );
3282
3283 buf.EncodeWords( seq, options.NAA, false );
3284
3285 // if minimal alignment length > len, return
3286 // I can not return earlier, because I need to calc the word_encodes etc
3287 if (options.min_control>len) return 0; // return flag=0
3288
3289 // lookup_aan
3290 int aan_no = len - options.NAA + 1;
3291 int M = frag_size ? table.frag_count : S;
3292 table.CountWords(aan_no, word_encodes, word_encodes_no, lookCounts, indexMapping, false, required_aan);
3293
3294 // contained_in_old_lib()
3295 int len_upper_bound = param.len_upper_bound;
3296 int len_lower_bound = param.len_lower_bound;
3297 int band_left, band_right, best_score, band_width1, best_sum, len2, alnln, len_eff1;
3298 int tiden_no, band_center;
3299 float tiden_pc, distance=0;
3300 int talign_info[5];
3301 int best1, sum;
3302 INTs *lookptr;
3303 char *seqj;
3304 int frg2 = frag_size ? (len - NAA + options.band_width ) / frag_size + 1 + 1 : 0;
3305 int lens;
3306 int has_aa2 = 0;
3307
3308 IndexCount *ic = lookCounts.items;
3309 ic = lookCounts.items;
3310 for(; ic->count; ic++){
3311 if( ! frag_size ){
3312 indexMapping[ ic->index ] = 0;
3313 if ( ic->count < required_aan ) continue;
3314 }
3315
3316 Sequence *rep = table.sequences[ ic->index ];
3317 len2 = rep->size;
3318 if (len2 > len_upper_bound ) continue;
3319 if (options.has2D && len2 < len_lower_bound ) continue;
3320 if( frag_size ){
3321 uint32_t *ims = & indexMapping[ ic->index ];
3322 int count = ic->count;
3323 k = (len2 - NAA) / frag_size + 1;
3324 sum = 0;
3325 for (j1=0; j1<frg2; j1++){
3326 uint32_t im = ims[j1];
3327 if( im ) sum += lookCounts[im-1].count;
3328 }
3329 count = sum;
3330 for (j1=frg2; j1<k; j1++) {
3331 uint32_t im1 = ims[j1];
3332 uint32_t im2 = ims[j1-frg2];
3333 if( im1 ) sum += lookCounts[im1-1].count;
3334 if( im2 ) sum -= lookCounts[im2-1].count;
3335 if (sum > count) count = sum;
3336 }
3337 if ( count < required_aan ) continue;
3338 }
3339
3340 param.ControlLongCoverage( len2, options );
3341
3342 if ( has_aa2 == 0 ) { // calculate AAP array
3343 buf.ComputeAAP( seqi, seq->size );
3344 has_aa2 = 1;
3345 }
3346 seqj = rep->data; //NR_seq[NR90_idx[j]];
3347
3348 band_width1 = (options.band_width < len+len2-2 ) ? options.band_width : len+len2-2;
3349 diag_test_aapn(NAA1, seqj, len, len2, buf, best_sum,
3350 band_width1, band_left, band_center, band_right, required_aa1);
3351 if ( best_sum < required_aa2 ) continue;
3352
3353 int rc = FAILED_FUNC;
3354 if (options.print || aln_cover_flag) //return overlap region
3355 rc = local_band_align(seqi, seqj, len, len2, mat,
3356 best_score, tiden_no, alnln, distance, talign_info,
3357 band_left, band_center, band_right, buf);
3358 else
3359 rc = local_band_align(seqi, seqj, len, len2, mat,
3360 best_score, tiden_no, alnln, distance, talign_info,
3361 band_left, band_center, band_right, buf);
3362 if ( rc == FAILED_FUNC ) continue;
3363 if ( tiden_no < required_aa1 ) continue;
3364 lens = len;
3365 if( options.has2D && len > len2 ) lens = len2;
3366 len_eff1 = (options.global_identity == 0) ? alnln : (lens - talign_info[4]);
3367 tiden_pc = tiden_no / (float) len_eff1;
3368 if( options.useDistance ){
3369 if (distance > options.distance_thd ) continue;
3370 if (distance >= seq->distance) continue; // existing distance
3371 }else{
3372 if (tiden_pc < options.cluster_thd) continue;
3373 if (tiden_pc <= seq->identity) continue; // existing iden_no
3374 }
3375 if (aln_cover_flag) {
3376 if ( talign_info[3]-talign_info[2]+1 < min_aln_lenL) continue;
3377 if ( talign_info[1]-talign_info[0]+1 < min_aln_lenS) continue;
3378 }
3379 if( options.has2D ) seq->state |= IS_REDUNDANT ;
3380 flag = 1; seq->identity = tiden_pc; seq->cluster_id = rep->cluster_id;
3381 seq->distance = distance;
3382 seq->coverage[0] = talign_info[0] +1;
3383 seq->coverage[1] = talign_info[1] +1;
3384 seq->coverage[2] = talign_info[2] +1;
3385 seq->coverage[3] = talign_info[3] +1;
3386 if (not options.cluster_best) break;
3387 update_aax_cutoff(aa1_cutoff, aa2_cutoff, aan_cutoff,
3388 options.tolerance, naa_stat_start_percent, naa_stat, NAA, tiden_pc);
3389 param.ComputeRequiredBases( options.NAA, 2, options );
3390 }
3391 if( frag_size ) ic = lookCounts.items;
3392 while( ic->count ){
3393 indexMapping[ ic->index ] = 0;
3394 ic += 1;
3395 }
3396 lookCounts.size = 0;
3397 if (flag == 1) { // if similar to old one delete it
3398 if (! options.cluster_best) {
3399 seq->Clear();
3400 seq->state |= IS_REDUNDANT ;
3401 }
3402 }
3403 return flag;
3404
3405 }
CheckOneEST(Sequence * seq,WordTable & table,WorkingParam & param,WorkingBuffer & buf,const Options & options)3406 int SequenceDB::CheckOneEST( Sequence *seq, WordTable & table, WorkingParam & param, WorkingBuffer & buf, const Options & options )
3407 {
3408 NVector<IndexCount> & lookCounts = buf.lookCounts;
3409 NVector<uint32_t> & indexMapping = buf.indexMapping;
3410 Vector<INTs> & word_encodes_no = buf.word_encodes_no;
3411 Vector<INTs> & aap_list = buf.aap_list;
3412 Vector<INTs> & aap_begin = buf.aap_begin;
3413 Vector<int> & word_encodes = buf.word_encodes;
3414 Vector<int> & taap = buf.taap;
3415 Vector<int> & aan_list_comp = buf.aan_list_comp;
3416 char *seqi_comp = & buf.seqi_comp[0];
3417
3418 int & aln_cover_flag = param.aln_cover_flag;
3419 int & required_aa1 = param.required_aa1;
3420 int & required_aas = param.required_aas;
3421 int & required_aan = param.required_aan;
3422 int & min_aln_lenS = param.min_aln_lenS;
3423 int & min_aln_lenL = param.min_aln_lenL;
3424
3425 char *seqi = seq->data;
3426 int j, len = seq->size;
3427 int flag = 0;
3428 int S = table.sequences.size();
3429 int len_eff = len;
3430 if( S ){
3431 int min = table.sequences[S-1]->size;
3432 if( min < len ){
3433 if( len * options.diff_cutoff2 > min ) min = (int)(len * options.diff_cutoff2);
3434 if( (len - options.diff_cutoff_aa2) > min ) min = len - options.diff_cutoff_aa2;
3435 len_eff = min;
3436 }
3437 }
3438
3439
3440 //liwz 2016 01, seq is too short for the shortest (longer) seq in word_table to satisfy -aL option
3441 //longer seqeunce * -aL -band_width
3442 if ( S ) {
3443 int min = table.sequences[S-1]->size;
3444 int min_red = min * options.long_coverage - options.band_width;
3445 if (len < min_red) return 0; // return flag=0
3446 }
3447
3448
3449 param.ControlShortCoverage( len_eff, options );
3450 param.ComputeRequiredBases( options.NAA, 4, options );
3451 int skip = buf.EncodeWords( seq, options.NAA, true );
3452 required_aan -= skip;
3453 required_aas -= skip;
3454 required_aa1 -= skip;
3455 if( required_aan <= 0 ) required_aan = 1;
3456 if( required_aas <= 0 ) required_aas = 1;
3457 if( required_aa1 <= 0 ) required_aa1 = 1;
3458
3459 // if minimal alignment length > len, return
3460 // I can not return earlier, because I need to calc the word_encodes etc
3461 if (options.min_control>len) return 0; // return flag=0
3462
3463 int aan_no = len - options.NAA + 1;
3464
3465 // contained_in_old_lib()
3466 int len_upper_bound = param.len_upper_bound;
3467 int len_lower_bound = param.len_lower_bound;
3468 int band_left, band_right, best_score, band_width1, best_sum, len2, alnln, len_eff1;
3469 int tiden_no, band_center;
3470 float tiden_pc, distance=0;
3471 int talign_info[5];
3472 int j0, comp, lens;
3473 char *seqj;
3474
3475 for(comp=0; comp<2; comp++){
3476 if( comp ){
3477 for (j0=0; j0<aan_no; j0++) {
3478 j = word_encodes[j0];
3479 if ( j<0 ) aan_list_comp[j0] = j;
3480 else aan_list_comp[j0] = Comp_AAN_idx[j];
3481 }
3482 make_comp_iseq(len, seqi_comp, seqi);
3483 seqi = seqi_comp;
3484 }
3485 int has_aas = 0;
3486
3487 if( comp ){
3488 table.CountWords(aan_no, aan_list_comp, word_encodes_no, lookCounts, indexMapping, true, required_aan );
3489 }else{
3490 table.CountWords(aan_no, word_encodes, word_encodes_no, lookCounts, indexMapping, true, required_aan );
3491 }
3492
3493 IndexCount *ic = lookCounts.items;
3494 ic = lookCounts.items;
3495 for(; ic->count; ic++){
3496 indexMapping[ ic->index ] = 0;
3497 if ( ic->count < required_aan ) continue;
3498 Sequence *rep = table.sequences[ic->index];
3499
3500 len2 = rep->size;
3501 if (len2 > len_upper_bound ) continue;
3502 if (options.has2D && len2 < len_lower_bound ) continue;
3503
3504 seqj = rep->data;
3505
3506 param.ControlLongCoverage( len2, options );
3507
3508 if ( has_aas == 0 ) { // calculate AAP array
3509 buf.ComputeAAP2( seqi, seq->size );
3510 has_aas = 1;
3511 }
3512
3513 band_width1 = (options.band_width < len+len2-2 ) ? options.band_width : len+len2-2;
3514 diag_test_aapn_est(NAA1, seqj, len, len2, buf, best_sum,
3515 band_width1, band_left, band_center, band_right, required_aa1);
3516 if ( best_sum < required_aas ) continue;
3517 //if( comp and flag and (not options.cluster_best) and j > rep->cluster_id ) goto Break;
3518
3519 int rc = FAILED_FUNC;
3520 if (options.print || aln_cover_flag){ //return overlap region
3521 rc = local_band_align(seqi, seqj, len, len2, mat,
3522 best_score, tiden_no, alnln, distance, talign_info,
3523 band_left, band_center, band_right, buf);
3524 if( comp ){
3525 talign_info[0] = len - talign_info[0] - 1;
3526 talign_info[1] = len - talign_info[1] - 1;
3527 }
3528 }else{
3529 //printf( "%5i %5i %5i %5i\n", band_width1, band_right-band_left, band_left, band_right );
3530 rc = local_band_align(seqi, seqj, len, len2, mat,
3531 best_score, tiden_no, alnln, distance, talign_info,
3532 band_left, band_center, band_right, buf);
3533 }
3534 if ( rc == FAILED_FUNC ) continue;
3535 //printf( "%i %i %i\n", best_score, tiden_no, required_aa1 );
3536 if ( tiden_no < required_aa1 ) continue;
3537 if ( options.is454 ){
3538 if (talign_info[2] != talign_info[0]) continue; // same start
3539 if (talign_info[0] > 1) continue; // one mismatch allowed at beginning
3540 if ((len-talign_info[1]) > 2) continue; // one mismatch allowed at end
3541 }
3542
3543 lens = len;
3544 if( options.has2D && len > len2 ) lens = len2;
3545 len_eff1 = (options.global_identity == 0) ? alnln : (lens - talign_info[4]);
3546 tiden_pc = tiden_no / (float)len_eff1;
3547 //printf( "%i %f\n", tiden_no, tiden_pc );
3548 if( options.useDistance ){
3549 if (distance > options.distance_thd ) continue;
3550 if (options.cluster_best and distance >= seq->distance) continue; // existing distance
3551 }else{
3552 if (tiden_pc < options.cluster_thd) continue;
3553 if (options.cluster_best and tiden_pc < seq->identity) continue; // existing iden_no
3554 }
3555 if (aln_cover_flag) {
3556 if ( talign_info[3]-talign_info[2]+1 < min_aln_lenL) continue;
3557 if( comp ){
3558 if ( talign_info[0]-talign_info[1]+1 < min_aln_lenS) continue;
3559 }else{
3560 if ( talign_info[1]-talign_info[0]+1 < min_aln_lenS) continue;
3561 }
3562 }
3563 if( options.cluster_best and fabs(tiden_pc - seq->identity) < 1E-9 and rep->cluster_id >= seq->cluster_id ) continue;
3564 if( (not options.cluster_best) and flag !=0 and rep->cluster_id >= seq->cluster_id ) continue;
3565 flag = comp ? -1 : 1;
3566 seq->identity = tiden_pc;
3567 seq->distance = distance;
3568 seq->cluster_id = rep->cluster_id;
3569 seq->coverage[0] = talign_info[0] +1;
3570 seq->coverage[1] = talign_info[1] +1;
3571 seq->coverage[2] = talign_info[2] +1;
3572 seq->coverage[3] = talign_info[3] +1;
3573 if (not options.cluster_best) break;
3574 }
3575 while( ic->count ){
3576 indexMapping[ ic->index ] = 0;
3577 ic += 1;
3578 }
3579 lookCounts.size = 0;
3580 if (not options.option_r ) break;
3581 }
3582 if ((flag == 1) || (flag == -1)) { // if similar to old one delete it
3583 if (! options.cluster_best) {
3584 seq->Clear();
3585 seq->state |= IS_REDUNDANT ;
3586 }
3587 if( flag == -1 )
3588 seq->state |= IS_MINUS_STRAND;
3589 else
3590 seq->state &= ~IS_MINUS_STRAND;
3591 }
3592 return flag;
3593 }
ComputeDistance(const Options & options)3594 void SequenceDB::ComputeDistance( const Options & options )
3595 {
3596 int i, j, N = sequences.size();
3597 int best_score, best_sum;
3598 int band_width1, band_left, band_center, band_right, required_aa1;
3599 int tiden_no, alnln;
3600 int talign_info[5];
3601 float distance;
3602 WorkingBuffer buf( N, max_len, options );
3603
3604 Vector<NVector<float> > dists( N, NVector<float>(N) );
3605
3606 Sequence comseq( *sequences[0] );
3607
3608 for(i=0; i<N; i++){
3609 Sequence *seq = sequences[i];
3610 char *seqi = seq->data;
3611 int len = seq->size;
3612 buf.EncodeWords( seq, options.NAA, false );
3613 buf.ComputeAAP2( seqi, seq->size );
3614 dists[i][i] = 0.0;
3615 if((i+1)%1000 ==0) printf( "%9i\n", (i+1) );
3616 for(j=0; j<i; j++){
3617 Sequence *rep = sequences[j];
3618 char *seqj = rep->data;
3619 int len2 = rep->size;
3620 band_width1 = (options.band_width < len+len2-2 ) ? options.band_width : len+len2-2;
3621 diag_test_aapn_est(NAA1, seqj, len, len2, buf, best_sum,
3622 band_width1, band_left, band_center, band_right, 0);
3623 local_band_align(seqi, seqj, len, len2, mat,
3624 best_score, tiden_no, alnln, distance, talign_info,
3625 band_left, band_center, band_right, buf);
3626 dists[seq->index][rep->index] = dists[rep->index][seq->index] = distance;
3627 }
3628 if (not options.option_r ) break;
3629 comseq.index = seq->index;
3630 comseq.size = len;
3631 for(j=0; j<len; j++) comseq.data[i] = seq->data[len-i-1];
3632 seqi = comseq.data;
3633 buf.EncodeWords( &comseq, options.NAA, false );
3634 buf.ComputeAAP2( seqi, seq->size );
3635 for(j=0; j<i; j++){
3636 Sequence *rep = sequences[j];
3637 char *seqj = rep->data;
3638 int len2 = rep->size;
3639 band_width1 = (options.band_width < len+len2-2 ) ? options.band_width : len+len2-2;
3640 diag_test_aapn_est(NAA1, seqj, len, len2, buf, best_sum,
3641 band_width1, band_left, band_center, band_right, 0);
3642 local_band_align(seqi, seqj, len, len2, mat,
3643 best_score, tiden_no, alnln, distance, talign_info,
3644 band_left, band_center, band_right, buf);
3645 if( distance < dists[seq->index][rep->index] )
3646 dists[seq->index][rep->index] = dists[rep->index][seq->index] = distance;
3647 }
3648 }
3649 std::string output = options.output + ".dist";
3650 FILE *fout = fopen( output.c_str(), "w+" );
3651 fprintf( fout, "1" );
3652 for(i=1; i<N; i++) fprintf( fout, "\t%i", i+1 );
3653 fprintf( fout, "\n" );
3654 for(i=0; i<N; i++){
3655 fprintf( fout, "%g", dists[i][0] );
3656 for(j=1; j<N; j++) fprintf( fout, "\t%g", dists[i][j] );
3657 fprintf( fout, "\n" );
3658 }
3659 fclose( fout );
3660 }
DoClustering(const Options & options)3661 void SequenceDB::DoClustering( const Options & options )
3662 {
3663 int i;
3664 int NAA = options.NAA;
3665 double aa1_cutoff = options.cluster_thd;
3666 double aas_cutoff = 1 - (1-options.cluster_thd)*4;
3667 double aan_cutoff = 1 - (1-options.cluster_thd)*options.NAA;
3668 int seq_no = sequences.size();
3669 int frag_no = seq_no;
3670 int frag_size = options.frag_size;
3671 int len, len_bound;
3672 int flag;
3673
3674 #if 0
3675 ComputeDistance( options );
3676 return;
3677 #endif
3678
3679 if( options.threads > 1 ){
3680 DoClustering( options.threads, options );
3681 temp_files.Clear();
3682 return;
3683 }
3684
3685 if (frag_size){
3686 frag_no = 0;
3687 for (i=0; i<seq_no; i++) frag_no += (sequences[i]->size - NAA) / frag_size + 1;
3688 }
3689
3690 if( not options.isEST )
3691 cal_aax_cutoff(aa1_cutoff, aas_cutoff, aan_cutoff, options.cluster_thd,
3692 options.tolerance, naa_stat_start_percent, naa_stat, NAA);
3693
3694 WorkingParam param( aa1_cutoff, aas_cutoff, aan_cutoff );
3695 WorkingBuffer buffer( frag_no, max_len, options );
3696
3697 WordTable word_table( options.NAA, NAAN );
3698
3699 size_t mem_need = MinimalMemory( frag_no, buffer.total_bytes, 1, options );
3700 size_t mem_limit = MemoryLimit( mem_need, options );
3701 size_t mem, mega = 1000000;
3702 int N = sequences.size();
3703
3704 size_t total_letters = total_letter;
3705 size_t tabsize = 0;
3706
3707 Options opts( options );
3708 opts.ComputeTableLimits( min_len, max_len, len_n50, mem_need );
3709
3710 for(i=0; i<N; ){
3711 float redundancy = (rep_seqs.size() + 1.0) / (i + 1.0);
3712 int m = i;
3713 size_t sum = 0;
3714 size_t max_items = opts.max_entries;
3715 size_t max_seqs = opts.max_sequences;
3716 size_t items = 0;
3717
3718 // find a block from i to m, so that this block can fit into a word table
3719 // ...
3720 // i ++++++++++++++++++++++++++
3721 // ++++++++++++++++++++
3722 // ++++++++++++++++
3723 // m +++++++++++++
3724 // ...
3725 while( m < N && (sum*redundancy) < max_seqs && items < max_items ){
3726 Sequence *seq = sequences[m];
3727 if( ! (seq->state & IS_REDUNDANT) ){
3728 if ( options.store_disk ) seq->SwapIn();
3729 items += (size_t)(seq->size * redundancy);
3730 sum += 1;
3731 }
3732 m ++;
3733 }
3734 if( m > N ) m = N;
3735 printf( "\rcomparing sequences from %9i to %9i\n", i, m );
3736 fflush( stdout );
3737 for(int ks=i; ks<m; ks++){ // clustering this block
3738 Sequence *seq = sequences[ks];
3739 i = ks + 1;
3740 if (seq->state & IS_REDUNDANT) continue;
3741 ClusterOne( seq, ks, word_table, param, buffer, options );
3742 total_letters -= seq->size;
3743 if( options.store_disk && (seq->state & IS_REDUNDANT) ) seq->SwapOut();
3744 if( word_table.size >= max_items ) break;
3745 int tmax = max_seqs - (frag_size ? seq->size / frag_size + 1 : 0);
3746 if( word_table.sequences.size() >= tmax ) break;
3747 } // finishing word table from this block
3748 m = i;
3749 if( word_table.size == 0 ) continue;
3750 float p0 = 0;
3751 for(int j=m; j<N; j++){ // use this word table to screen rest sequences m->N
3752 Sequence *seq = sequences[j];
3753 if (seq->state & IS_REDUNDANT) continue;
3754 if ( options.store_disk ) seq->SwapIn();
3755 CheckOne( seq, word_table, param, buffer, options );
3756 total_letters -= seq->size;
3757 if ( options.store_disk && (seq->state & IS_REDUNDANT) ) seq->SwapOut();
3758 int len_bound = param.len_upper_bound;
3759 if( word_table.sequences[ word_table.sequences.size()-1 ]->size > len_bound ){
3760 break;
3761 }
3762 float p = (100.0*j)/N;
3763 if( p > p0+1E-1 ){ // print only if the percentage changed
3764 printf( "\r%4.1f%%", p );
3765 fflush( stdout );
3766 p0 = p;
3767 }
3768 }
3769 if( word_table.size > tabsize ) tabsize = word_table.size;
3770 //if( i && i < m ) printf( "\r---------- %6i remaining sequences to the next cycle\n", m-i );
3771 word_table.Clear();
3772 }
3773 printf( "\n%9i finished %9i clusters\n", sequences.size(), rep_seqs.size() );
3774 mem = (mem_need + tabsize*sizeof(IndexCount))/mega;
3775 printf( "\nApproximated maximum memory consumption: %liM\n", mem );
3776 temp_files.Clear();
3777 word_table.Clear();
3778
3779 #if 0
3780 int zeros = 0;
3781 for(i=0; i<word_table.indexCounts.size(); i++) zeros += word_table.indexCounts[i].Size() ==0;
3782 printf( "%9i empty entries out of %9i\n", zeros, word_table.indexCounts.size() );
3783 #endif
3784 }
3785
ClusterTo(SequenceDB & other,const Options & options)3786 void SequenceDB::ClusterTo( SequenceDB & other, const Options & options )
3787 {
3788 int i, is, js, flag;
3789 int len, len_tmp, len_lower_bound, len_upper_bound;
3790 int NR2_red_no = 0;
3791 int aan_no = 0;
3792 char *seqi;
3793 int NAA = options.NAA;
3794 double aa1_cutoff = options.cluster_thd;
3795 double aas_cutoff = 1 - (1-options.cluster_thd)*4;
3796 double aan_cutoff = 1 - (1-options.cluster_thd)*options.NAA;
3797 Vector<int> word_encodes( MAX_SEQ );
3798 Vector<INTs> word_encodes_no( MAX_SEQ );
3799
3800 if( not options.isEST ){
3801 cal_aax_cutoff(aa1_cutoff, aas_cutoff, aan_cutoff, options.cluster_thd,
3802 options.tolerance, naa_stat_start_percent, naa_stat, NAA);
3803 }
3804
3805
3806 int N = other.sequences.size();
3807 int M = sequences.size();
3808 int T = options.threads;
3809
3810 valarray<size_t> counts(T);
3811 Vector<WorkingParam> params(T);
3812 Vector<WorkingBuffer> buffers(T);
3813 WorkingParam & param = params[0];
3814 WorkingBuffer & buffer = buffers[0];
3815 for(i=0; i<T; i++){
3816 params[i].Set( aa1_cutoff, aas_cutoff, aan_cutoff );
3817 buffers[i].Set( N, max_len, options );
3818 }
3819 if( T >1 ) omp_set_num_threads(T);
3820
3821 size_t mem_need = MinimalMemory( N, buffer.total_bytes, T, options, other.total_letter+other.total_desc );
3822 size_t mem_limit = MemoryLimit( mem_need, options );
3823
3824 Options opts( options );
3825 opts.ComputeTableLimits( min_len, max_len, len_n50, mem_need );
3826
3827 WordTable word_table( options.NAA, NAAN );
3828
3829 size_t max_items = opts.max_entries;
3830 size_t max_seqs = opts.max_sequences;
3831 for(i=0; i<N; ){
3832 size_t items = 0;
3833 size_t sum = 0;
3834 int m = i;
3835 while( m < N && sum < max_seqs && items < max_items ){
3836 Sequence *seq = other.sequences[m];
3837 if( ! (seq->state & IS_REDUNDANT) ){
3838 if ( options.store_disk ) seq->SwapIn();
3839 items += seq->size;
3840 sum += 1;
3841 }
3842 m ++;
3843 }
3844 if( m > N ) m = N;
3845 //printf( "m = %i %i, %i\n", i, m, m-i );
3846 for(int ks=i; ks<m; ks++){
3847 Sequence *seq = other.sequences[ks];
3848 len = seq->size;
3849 seqi = seq->data;
3850 calc_ann_list(len, seqi, NAA, aan_no, word_encodes, word_encodes_no, options.isEST);
3851 word_table.AddWordCounts(aan_no, word_encodes, word_encodes_no, ks-i, options.isEST);
3852 word_table.sequences.Append( seq );
3853 seq->cluster_id = ks;
3854 seq->state |= IS_REP;
3855 if ( (ks+1) % 1000 == 0 ) {
3856 printf( "." );
3857 fflush( stdout );
3858 if ( (ks+1) % 10000 == 0 ) printf( "%9i finished\n", ks+1 );
3859 }
3860 }
3861 float p0 = 0;
3862 if( T > 1 ){
3863 int JM = M;
3864 counts = 0;
3865 #pragma omp parallel for schedule( dynamic, 1 )
3866 for(int j=0; j<JM; j++){
3867 Sequence *seq = sequences[j];
3868 if( seq->state & IS_REDUNDANT ) continue;
3869 int len = seq->size;
3870 char *seqi = seq->data;
3871 int len_upper_bound = upper_bound_length_rep(len,options);
3872 int len_lower_bound = len - options.diff_cutoff_aa2;
3873
3874 int len_tmp = (int) ( ((double)len) * options.diff_cutoff2);
3875 if (len_tmp < len_lower_bound) len_lower_bound = len_tmp;
3876
3877 int tid = omp_get_thread_num();
3878 params[tid].len_upper_bound = len_upper_bound;
3879 params[tid].len_lower_bound = len_lower_bound;
3880
3881 if( word_table.sequences[ word_table.sequences.size()-1 ]->size > len_upper_bound ){
3882 JM = 0;
3883 continue;
3884 }
3885
3886 int flag = other.CheckOne( seq, word_table, params[tid], buffers[tid], options );
3887 if ((flag == 1) || (flag == -1)) { // if similar to old one delete it
3888 if (! options.cluster_best) {
3889 seq->Clear();
3890 seq->state |= IS_REDUNDANT ;
3891 counts[tid] ++;
3892 }
3893 if( flag == -1 ) seq->state |= IS_MINUS_STRAND; // for EST only
3894 }
3895 float p = (100.0*j)/M;
3896 if( p > p0+1E-1 ){ // print only if the percentage changed
3897 printf( "\r%4.1f%%", p );
3898 fflush( stdout );
3899 p0 = p;
3900 }
3901 }
3902 for(int j=0; j<T; j++) NR2_red_no += counts[j];
3903 }else{
3904 for(int j=0; j<M; j++){
3905 Sequence *seq = sequences[j];
3906 if( seq->state & IS_REDUNDANT ) continue;
3907 len = seq->size;
3908 seqi = seq->data;
3909 len_upper_bound = upper_bound_length_rep(len,options);
3910 len_lower_bound = len - options.diff_cutoff_aa2;
3911
3912 len_tmp = (int) ( ((double)len) * options.diff_cutoff2);
3913 if (len_tmp < len_lower_bound) len_lower_bound = len_tmp;
3914 param.len_upper_bound = len_upper_bound;
3915 param.len_lower_bound = len_lower_bound;
3916
3917 if( word_table.sequences[ word_table.sequences.size()-1 ]->size > len_upper_bound ){
3918 break;
3919 }
3920
3921 flag = other.CheckOne( seq, word_table, param, buffer, options );
3922 if ((flag == 1) || (flag == -1)) { // if similar to old one delete it
3923 if (! options.cluster_best) {
3924 seq->Clear();
3925 seq->state |= IS_REDUNDANT ;
3926 NR2_red_no ++;
3927 }
3928 if( flag == -1 ) seq->state |= IS_MINUS_STRAND; // for EST only
3929 }
3930 float p = (100.0*j)/M;
3931 if( p > p0+1E-1 ){ // print only if the percentage changed
3932 printf( "\r%4.1f%%", p );
3933 fflush( stdout );
3934 p0 = p;
3935 }
3936 }
3937 }
3938 printf( "\r..........%9i compared %9i clusters\n", i, NR2_red_no );
3939 word_table.Clear();
3940 word_table.size = 0;
3941 i = m;
3942 }
3943
3944 if (options.cluster_best) {//delete redundant sequences in options.cluster_best mode
3945 for (i=0; i<(int)sequences.size(); i++){
3946 Sequence *seq = sequences[i];
3947 if (seq->identity > 0 ){
3948 seq->state |= IS_REDUNDANT;
3949 NR2_red_no ++;
3950 }
3951 }
3952 }
3953 for (i=0; i<(int)sequences.size(); i++){
3954 Sequence *seq = sequences[i];
3955 if( seq->identity <0 ) seq->identity *= -1;
3956 if( not(seq->state & IS_REDUNDANT) ) rep_seqs.Append( i );
3957 }
3958
3959 cout << endl;
3960 cout << sequences.size() << " compared\t" << NR2_red_no << " clustered" << endl;
3961 temp_files.Clear();
3962 }
3963
calc_ann_list(int len,char * seqi,int NAA,int & aan_no,Vector<int> & aan_list,Vector<INTs> & aan_list_no,bool est)3964 int calc_ann_list(int len, char *seqi, int NAA, int& aan_no, Vector<int> & aan_list, Vector<INTs> & aan_list_no, bool est)
3965 {
3966 int i, j, k, i0, i1, k1;
3967
3968 // check_aan_list
3969 aan_no = len - NAA + 1;
3970 for (j=0; j<aan_no; j++) {
3971 aan_list[j] = 0;
3972 for (k=0, k1=NAA-1; k<NAA; k++, k1--) aan_list[j] += seqi[j+k] * NAAN_array[k1];
3973 }
3974 if( est ){
3975 // for the short word containing 'N', mask it to '-1'
3976 for (j=0; j<len; j++){
3977 if ( seqi[j] >= 4 ) { // here N is 4
3978 i0 = (j-NAA+1 > 0) ? j-NAA+1 : 0;
3979 i1 = j < aan_no ? j : aan_no - 1;
3980 for (i=i0; i<=i1; i++) aan_list[i]=-1;
3981 }
3982 }
3983 }
3984
3985 std::sort(aan_list.begin(), aan_list.begin() + aan_no);
3986 for(j=0; j<aan_no; j++) aan_list_no[j]=1;
3987 for(j=aan_no-1; j; j--) {
3988 if (aan_list[j] == aan_list[j-1]) {
3989 aan_list_no[j-1] += aan_list_no[j];
3990 aan_list_no[j]=0;
3991 }
3992 }
3993 return OK_FUNC;
3994 } // END calc_ann_list
3995
make_comp_short_word_index(int NAA,int * NAAN_array,Vector<int> & Comp_AAN_idx)3996 void make_comp_short_word_index(int NAA, int *NAAN_array, Vector<int> &Comp_AAN_idx) {
3997 int i, j, k, icomp, k1;
3998 int c[4] = {3,2,1,0};
3999 unsigned char short_word[32]; //short_word[12] is enough
4000
4001 int NAA1 = NAAN_array[1];
4002 int NAAN = NAAN_array[NAA];
4003
4004 for (i=0; i<NAAN; i++) {
4005 // decompose i back to short_word
4006 for (k=i, j=0; j<NAA; j++) {
4007 short_word[j] = (unsigned char) (k % NAA1);
4008 k = k / NAA1;
4009 }
4010
4011 // calc_comp_aan_list
4012 icomp=0;
4013 for (k=0, k1=NAA-1; k<NAA; k++, k1--) icomp += c[short_word[k1]] * NAAN_array[k];
4014
4015 Comp_AAN_idx[i] = icomp;
4016 }
4017 } // make_comp_short_word_index
4018
4019 //quick_sort_idx calling (a, idx, 0, no-1)
4020 //sort a with another array idx
4021 //so that idx rearranged
quick_sort_idx(int * a,int * idx,int lo0,int hi0)4022 int quick_sort_idx (int *a, int *idx, int lo0, int hi0 ) {
4023 int lo = lo0;
4024 int hi = hi0;
4025 int mid;
4026 int tmp;
4027
4028 if ( hi0 > lo0) {
4029 mid = a[ ( lo0 + hi0 ) / 2 ];
4030
4031 while( lo <= hi ) {
4032 while( ( lo < hi0 ) && ( a[lo] < mid ) ) lo++;
4033 while( ( hi > lo0 ) && ( a[hi] > mid ) ) hi--;
4034 if( lo <= hi ) {
4035 tmp=a[lo]; a[lo]=a[hi]; a[hi]=tmp;
4036 tmp=idx[lo]; idx[lo]=idx[hi]; idx[hi]=tmp;
4037 lo++; hi--;
4038 }
4039 } // while
4040
4041 if( lo0 < hi ) quick_sort_idx(a, idx, lo0, hi );
4042 if( lo < hi0 ) quick_sort_idx(a, idx, lo, hi0 );
4043 } // if ( hi0 > lo0)
4044 return 0;
4045 } // quick_sort_idx
4046
4047
4048 //decreasing can not use reverse of quick_sort_idx due to tie
4049 //quick_sort_idxr calling (a, idx, 0, no-1)
4050 //sort a with another array idx
4051 //so that idx rearranged
quick_sort_idxr(int * a,int * idx,int lo0,int hi0)4052 int quick_sort_idxr (int *a, int *idx, int lo0, int hi0 ) {
4053 int lo = lo0;
4054 int hi = hi0;
4055 int mid;
4056 int tmp;
4057
4058 if ( hi0 > lo0) {
4059 mid = a[ ( lo0 + hi0 ) / 2 ];
4060
4061 while( lo <= hi ) {
4062 while( ( lo < hi0 ) && ( a[lo] > mid ) ) lo++;
4063 while( ( hi > lo0 ) && ( a[hi] < mid ) ) hi--;
4064 if( lo <= hi ) {
4065 tmp=a[lo]; a[lo]=a[hi]; a[hi]=tmp;
4066 tmp=idx[lo]; idx[lo]=idx[hi]; idx[hi]=tmp;
4067 lo++; hi--;
4068 }
4069 } // while
4070
4071 if( lo0 < hi ) quick_sort_idxr(a, idx, lo0, hi );
4072 if( lo < hi0 ) quick_sort_idxr(a, idx, lo, hi0 );
4073 } // if ( hi0 > lo0)
4074 return 0;
4075 } // quick_sort_idxr
4076
4077 /////////////////////////// END ALL ////////////////////////
4078
4079 int naa_stat_start_percent = 40;
4080 int naa_stat[5][61][4] = {
4081
4082 // cover 0.99
4083 {
4084 // N=5 N=4 N=3 N=2
4085 { 0, 0, 0, 7, }, // 40%
4086 { 0, 0, 0, 8, }, // 41%
4087 { 0, 0, 0, 9, }, // 42%
4088 { 0, 0, 0, 9, }, // 43%
4089 { 0, 0, 1, 10, }, // 44%
4090 { 0, 0, 1, 11, }, // 45%
4091 { 0, 0, 1, 12, }, // 46%
4092 { 0, 0, 2, 13, }, // 47%
4093 { 0, 0, 2, 14, }, // 48%
4094 { 0, 0, 4, 16, }, // 49%
4095 { 0, 0, 4, 16, }, // 50%
4096 { 0, 0, 5, 17, }, // 51%
4097 { 0, 0, 5, 18, }, // 52%
4098 { 0, 0, 7, 20, }, // 53%
4099 { 0, 1, 7, 21, }, // 54%
4100 { 0, 1, 7, 21, }, // 55%
4101 { 0, 2, 8, 23, }, // 56%
4102 { 0, 2, 8, 25, }, // 57%
4103 { 0, 2, 10, 25, }, // 58%
4104 { 0, 3, 10, 26, }, // 59%
4105 { 0, 4, 13, 28, }, // 60%
4106 { 0, 5, 13, 30, }, // 61%
4107 { 0, 5, 14, 30, }, // 62%
4108 { 1, 6, 15, 33, }, // 63%
4109 { 2, 7, 17, 34, }, // 64%
4110 { 2, 7, 17, 35, }, // 65%
4111 { 2, 9, 20, 37, }, // 66%
4112 { 4, 10, 20, 37, }, // 67%
4113 { 4, 11, 22, 40, }, // 68%
4114 { 5, 12, 24, 41, }, // 69%
4115 { 5, 12, 25, 42, }, // 70%
4116 { 6, 16, 27, 43, }, // 71%
4117 { 8, 16, 27, 45, }, // 72%
4118 { 9, 17, 29, 47, }, // 73%
4119 { 10, 18, 31, 47, }, // 74%
4120 { 10, 20, 32, 50, }, // 75%
4121 { 12, 20, 32, 51, }, // 76%
4122 { 14, 22, 36, 54, }, // 77%
4123 { 15, 24, 37, 55, }, // 78%
4124 { 17, 26, 41, 58, }, // 79%
4125 { 18, 29, 41, 59, }, // 80%
4126 { 20, 30, 45, 60, }, // 81%
4127 { 24, 35, 48, 62, }, // 82%
4128 { 26, 36, 48, 64, }, // 83%
4129 { 27, 38, 51, 65, }, // 84%
4130 { 31, 43, 54, 68, }, // 85%
4131 { 35, 43, 55, 70, }, // 86%
4132 { 36, 48, 60, 71, }, // 87%
4133 { 36, 50, 61, 73, }, // 88%
4134 { 40, 50, 61, 75, }, // 89%
4135 { 45, 54, 65, 75, }, // 90%
4136 { 52, 60, 70, 79, }, // 91%
4137 { 53, 62, 71, 81, }, // 92%
4138 { 57, 66, 75, 84, }, // 93%
4139 { 57, 66, 76, 85, }, // 94%
4140 { 64, 71, 78, 85, }, // 95%
4141 { 70, 75, 82, 89, }, // 96%
4142 { 77, 81, 86, 92, }, // 97%
4143 { 82, 86, 90, 94, }, // 98%
4144 { 83, 87, 91, 95, }, // 99%
4145 { 91, 93, 95, 97, }, // 100%
4146 },
4147 // cover 0.95
4148 {
4149 // N=5 N=4 N=3 N=2
4150 { 0, 0, 1, 9, }, // 40%
4151 { 0, 0, 2, 10, }, // 41%
4152 { 0, 0, 2, 11, }, // 42%
4153 { 0, 0, 3, 12, }, // 43%
4154 { 0, 0, 3, 12, }, // 44%
4155 { 0, 0, 4, 14, }, // 45%
4156 { 0, 0, 4, 14, }, // 46%
4157 { 0, 1, 5, 16, }, // 47%
4158 { 0, 1, 6, 17, }, // 48%
4159 { 0, 2, 7, 19, }, // 49%
4160 { 0, 2, 8, 19, }, // 50%
4161 { 0, 2, 8, 20, }, // 51%
4162 { 0, 2, 9, 21, }, // 52%
4163 { 0, 4, 10, 23, }, // 53%
4164 { 1, 4, 11, 24, }, // 54%
4165 { 1, 4, 11, 24, }, // 55%
4166 { 1, 5, 13, 26, }, // 56%
4167 { 2, 5, 13, 27, }, // 57%
4168 { 2, 6, 15, 29, }, // 58%
4169 { 2, 7, 15, 30, }, // 59%
4170 { 3, 8, 16, 31, }, // 60%
4171 { 4, 8, 18, 32, }, // 61%
4172 { 4, 9, 18, 33, }, // 62%
4173 { 5, 11, 20, 36, }, // 63%
4174 { 6, 12, 22, 37, }, // 64%
4175 { 6, 12, 22, 38, }, // 65%
4176 { 8, 14, 24, 40, }, // 66%
4177 { 8, 15, 25, 41, }, // 67%
4178 { 10, 16, 27, 42, }, // 68%
4179 { 10, 18, 28, 45, }, // 69%
4180 { 11, 18, 29, 45, }, // 70%
4181 { 14, 21, 31, 47, }, // 71%
4182 { 14, 22, 32, 48, }, // 72%
4183 { 14, 22, 33, 50, }, // 73%
4184 { 17, 24, 36, 52, }, // 74%
4185 { 17, 25, 36, 52, }, // 75%
4186 { 18, 27, 39, 54, }, // 76%
4187 { 20, 29, 41, 56, }, // 77%
4188 { 21, 31, 42, 58, }, // 78%
4189 { 21, 31, 46, 60, }, // 79%
4190 { 27, 35, 46, 60, }, // 80%
4191 { 28, 37, 50, 63, }, // 81%
4192 { 31, 38, 50, 64, }, // 82%
4193 { 34, 43, 53, 66, }, // 83%
4194 { 36, 45, 54, 67, }, // 84%
4195 { 41, 50, 60, 70, }, // 85%
4196 { 43, 51, 60, 71, }, // 86%
4197 { 45, 54, 63, 74, }, // 87%
4198 { 48, 55, 64, 75, }, // 88%
4199 { 54, 60, 68, 78, }, // 89%
4200 { 55, 62, 71, 80, }, // 90%
4201 { 56, 63, 71, 80, }, // 91%
4202 { 64, 70, 76, 84, }, // 92%
4203 { 69, 74, 80, 86, }, // 93%
4204 { 73, 78, 83, 88, }, // 94%
4205 { 74, 78, 84, 89, }, // 95%
4206 { 80, 84, 87, 91, }, // 96%
4207 { 83, 86, 90, 93, }, // 97%
4208 { 86, 89, 92, 95, }, // 98%
4209 { 91, 93, 95, 97, }, // 99%
4210 { 92, 93, 95, 97, }, // 100%
4211 },
4212 // cover 0.9
4213 {
4214 // N=5 N=4 N=3 N=2
4215 { 0, 0, 2, 11, }, // 40%
4216 { 0, 0, 3, 12, }, // 41%
4217 { 0, 0, 3, 12, }, // 42%
4218 { 0, 1, 4, 13, }, // 43%
4219 { 0, 1, 5, 14, }, // 44%
4220 { 0, 1, 5, 15, }, // 45%
4221 { 0, 1, 6, 16, }, // 46%
4222 { 0, 2, 7, 18, }, // 47%
4223 { 0, 2, 7, 18, }, // 48%
4224 { 0, 3, 9, 20, }, // 49%
4225 { 1, 4, 9, 20, }, // 50%
4226 { 1, 4, 10, 21, }, // 51%
4227 { 1, 4, 11, 23, }, // 52%
4228 { 2, 5, 12, 24, }, // 53%
4229 { 2, 5, 12, 25, }, // 54%
4230 { 2, 6, 13, 26, }, // 55%
4231 { 3, 7, 14, 28, }, // 56%
4232 { 3, 7, 15, 28, }, // 57%
4233 { 4, 8, 16, 30, }, // 58%
4234 { 5, 9, 17, 31, }, // 59%
4235 { 5, 10, 18, 32, }, // 60%
4236 { 6, 11, 20, 35, }, // 61%
4237 { 6, 11, 20, 35, }, // 62%
4238 { 7, 13, 22, 38, }, // 63%
4239 { 8, 14, 23, 39, }, // 64%
4240 { 8, 15, 24, 39, }, // 65%
4241 { 10, 16, 26, 42, }, // 66%
4242 { 10, 17, 27, 42, }, // 67%
4243 { 12, 19, 29, 44, }, // 68%
4244 { 13, 20, 30, 46, }, // 69%
4245 { 13, 21, 31, 47, }, // 70%
4246 { 16, 23, 33, 48, }, // 71%
4247 { 18, 25, 34, 50, }, // 72%
4248 { 18, 26, 36, 51, }, // 73%
4249 { 19, 28, 38, 53, }, // 74%
4250 { 20, 29, 38, 53, }, // 75%
4251 { 23, 30, 41, 56, }, // 76%
4252 { 24, 33, 43, 57, }, // 77%
4253 { 26, 34, 45, 59, }, // 78%
4254 { 28, 37, 48, 61, }, // 79%
4255 { 30, 37, 48, 62, }, // 80%
4256 { 33, 42, 52, 64, }, // 81%
4257 { 35, 43, 53, 65, }, // 82%
4258 { 38, 47, 56, 68, }, // 83%
4259 { 40, 47, 56, 68, }, // 84%
4260 { 44, 53, 61, 71, }, // 85%
4261 { 45, 53, 62, 73, }, // 86%
4262 { 50, 58, 66, 75, }, // 87%
4263 { 51, 58, 66, 76, }, // 88%
4264 { 57, 63, 71, 79, }, // 89%
4265 { 60, 66, 72, 81, }, // 90%
4266 { 62, 68, 75, 83, }, // 91%
4267 { 70, 74, 80, 85, }, // 92%
4268 { 74, 78, 82, 88, }, // 93%
4269 { 85, 87, 90, 92, }, // 94%
4270 { 86, 88, 90, 92, }, // 95%
4271 { 87, 89, 91, 93, }, // 96%
4272 { 87, 89, 92, 94, }, // 97%
4273 { 89, 91, 93, 96, }, // 98%
4274 { 93, 94, 96, 97, }, // 99%
4275 { 94, 95, 97, 98, }, // 100%
4276 },
4277 // cover 0.8
4278 {
4279 // N=5 N=4 N=3 N=2
4280 { 0, 1, 4, 13, }, // 40%
4281 { 0, 1, 5, 13, }, // 41%
4282 { 0, 1, 5, 14, }, // 42%
4283 { 0, 2, 6, 15, }, // 43%
4284 { 0, 2, 6, 16, }, // 44%
4285 { 0, 2, 7, 17, }, // 45%
4286 { 1, 3, 8, 18, }, // 46%
4287 { 1, 4, 9, 20, }, // 47%
4288 { 1, 4, 9, 20, }, // 48%
4289 { 2, 5, 11, 22, }, // 49%
4290 { 2, 5, 11, 22, }, // 50%
4291 { 2, 6, 12, 24, }, // 51%
4292 { 3, 6, 13, 25, }, // 52%
4293 { 3, 7, 14, 26, }, // 53%
4294 { 4, 8, 14, 27, }, // 54%
4295 { 4, 8, 15, 28, }, // 55%
4296 { 5, 9, 17, 30, }, // 56%
4297 { 5, 9, 17, 30, }, // 57%
4298 { 6, 11, 19, 32, }, // 58%
4299 { 7, 12, 20, 34, }, // 59%
4300 { 8, 12, 20, 34, }, // 60%
4301 { 9, 14, 22, 37, }, // 61%
4302 { 9, 14, 23, 37, }, // 62%
4303 { 10, 16, 25, 39, }, // 63%
4304 { 11, 17, 26, 41, }, // 64%
4305 { 12, 18, 27, 41, }, // 65%
4306 { 13, 20, 28, 43, }, // 66%
4307 { 14, 21, 30, 45, }, // 67%
4308 { 15, 22, 31, 46, }, // 68%
4309 { 17, 24, 33, 48, }, // 69%
4310 { 17, 24, 34, 48, }, // 70%
4311 { 19, 26, 36, 50, }, // 71%
4312 { 20, 27, 37, 51, }, // 72%
4313 { 21, 29, 39, 53, }, // 73%
4314 { 23, 31, 41, 55, }, // 74%
4315 { 23, 31, 41, 55, }, // 75%
4316 { 26, 34, 44, 58, }, // 76%
4317 { 28, 36, 46, 59, }, // 77%
4318 { 29, 37, 47, 60, }, // 78%
4319 { 34, 41, 50, 62, }, // 79%
4320 { 34, 42, 51, 63, }, // 80%
4321 { 38, 45, 55, 66, }, // 81%
4322 { 39, 46, 55, 67, }, // 82%
4323 { 44, 51, 60, 70, }, // 83%
4324 { 44, 51, 60, 70, }, // 84%
4325 { 49, 56, 64, 73, }, // 85%
4326 { 50, 57, 64, 74, }, // 86%
4327 { 57, 63, 69, 77, }, // 87%
4328 { 58, 64, 70, 78, }, // 88%
4329 { 68, 71, 76, 82, }, // 89%
4330 { 68, 72, 77, 83, }, // 90%
4331 { 75, 79, 81, 85, }, // 91%
4332 { 86, 87, 89, 90, }, // 92%
4333 { 88, 89, 90, 92, }, // 93%
4334 { 90, 91, 92, 93, }, // 94%
4335 { 91, 92, 93, 94, }, // 95%
4336 { 92, 94, 94, 95, }, // 96%
4337 { 93, 94, 95, 96, }, // 97%
4338 { 94, 95, 95, 96, }, // 98%
4339 { 94, 95, 96, 98, }, // 99%
4340 { 95, 96, 97, 98, }, // 100%
4341 },
4342 // cover 0.6
4343 {
4344 // N=5 N=4 N=3 N=2
4345 { 1, 2, 6, 15, }, // 40%
4346 { 1, 3, 7, 16, }, // 41%
4347 { 1, 3, 8, 17, }, // 42%
4348 { 2, 4, 9, 18, }, // 43%
4349 { 2, 4, 9, 19, }, // 44%
4350 { 2, 5, 10, 20, }, // 45%
4351 { 3, 5, 10, 21, }, // 46%
4352 { 3, 6, 12, 22, }, // 47%
4353 { 3, 6, 12, 23, }, // 48%
4354 { 4, 8, 14, 25, }, // 49%
4355 { 4, 8, 14, 25, }, // 50%
4356 { 5, 8, 15, 26, }, // 51%
4357 { 5, 9, 16, 27, }, // 52%
4358 { 6, 10, 17, 29, }, // 53%
4359 { 6, 11, 18, 30, }, // 54%
4360 { 7, 11, 18, 31, }, // 55%
4361 { 8, 12, 20, 32, }, // 56%
4362 { 8, 13, 20, 33, }, // 57%
4363 { 10, 14, 22, 35, }, // 58%
4364 { 10, 15, 23, 37, }, // 59%
4365 { 11, 16, 24, 37, }, // 60%
4366 { 12, 18, 26, 39, }, // 61%
4367 { 13, 18, 26, 40, }, // 62%
4368 { 14, 20, 28, 42, }, // 63%
4369 { 16, 22, 30, 43, }, // 64%
4370 { 16, 22, 31, 44, }, // 65%
4371 { 17, 23, 32, 45, }, // 66%
4372 { 18, 25, 33, 47, }, // 67%
4373 { 19, 26, 35, 48, }, // 68%
4374 { 21, 27, 36, 50, }, // 69%
4375 { 22, 29, 37, 51, }, // 70%
4376 { 24, 30, 39, 52, }, // 71%
4377 { 25, 32, 41, 53, }, // 72%
4378 { 26, 33, 42, 55, }, // 73%
4379 { 29, 35, 44, 57, }, // 74%
4380 { 29, 36, 45, 57, }, // 75%
4381 { 32, 39, 48, 60, }, // 76%
4382 { 34, 41, 50, 61, }, // 77%
4383 { 36, 43, 51, 62, }, // 78%
4384 { 40, 46, 54, 65, }, // 79%
4385 { 40, 46, 54, 65, }, // 80%
4386 { 46, 52, 59, 68, }, // 81%
4387 { 46, 52, 60, 69, }, // 82%
4388 { 53, 59, 65, 73, }, // 83%
4389 { 54, 60, 66, 73, }, // 84%
4390 { 63, 67, 73, 78, }, // 85%
4391 { 68, 71, 75, 79, }, // 86%
4392 { 78, 80, 82, 85, }, // 87%
4393 { 79, 81, 83, 85, }, // 88%
4394 { 83, 85, 86, 87, }, // 89%
4395 { 85, 86, 87, 89, }, // 90%
4396 { 86, 88, 89, 90, }, // 91%
4397 { 88, 89, 90, 91, }, // 92%
4398 { 90, 90, 91, 92, }, // 93%
4399 { 91, 92, 92, 93, }, // 94%
4400 { 92, 93, 94, 94, }, // 95%
4401 { 94, 94, 95, 95, }, // 96%
4402 { 95, 95, 96, 96, }, // 97%
4403 { 95, 96, 97, 97, }, // 98%
4404 { 96, 96, 97, 98, }, // 99%
4405 { 97, 98, 98, 99, }, // 100%
4406 },
4407 };
4408
4409