1 // =============================================================================
2 // CD-HI-EST
3 // http://cd-hit.org/
4 // Cluster Database at High Identity (EST version)
5 // modified from CD-HI
6 //
7 // program written by
8 // Weizhong Li
9 // UCSD, San Diego Supercomputer Center
10 // La Jolla, CA, 92093
11 // Email liwz@sdsc.edu
12 // at
13 // Adam Godzik's lab
14 // The Burnham Institute
15 // La Jolla, CA, 92037
16 // Email adam@burnham-inst.org
17 //
18 // Modified by:
19 // Limin Fu
20 // Center for Research in Biological Systems (CRBS), UCSD
21 // La Jolla, CA, 92093
22 // Email: l2fu@ucsd.edu, fu@daovm.net
23 // =============================================================================
24
25 #include "cdhit-common.h"
26 #include "cdhit-utility.h"
// EST version: discard any MAX_UAA definition picked up from the headers
// included above and redefine it as 4 for this translation unit.
#undef MAX_UAA
#define MAX_UAA 4
30
31 //over-write some defs in cd-hi-init.h for est version
32
// Forward declarations of helpers whose definitions are not in this file view.

// Called once at the start of main(), before the command line is parsed.
// NOTE(review): presumably switches the residue tables to nucleotides - confirm at the definition.
void setaa_to_na();

// Called from main() with (options.NAA, NAAN_array, Comp_AAN_idx) when
// options.option_r is set, after Comp_AAN_idx has been resized to seq_db.NAAN.
// NOTE(review): presumably fills Comp_AAN_idx with complement word indices - confirm.
void make_comp_short_word_index(int NAA, int *NAAN_array, Vector<int> & Comp_AAN_idx);

// Not used in the visible part of this file.
// NOTE(review): presumably writes the complement of iseq (length len) into iseq_comp - confirm.
void make_comp_iseq(int len, char *iseq_comp, char *iseq);
36
37
// Global run configuration: main() sets EST defaults on it, then fills it from
// the command line via SetOptions() and checks it with Validate().
Options options;
// Global sequence database: main() reads, sorts, clusters and writes it.
SequenceDB seq_db;
40
41 //////////////////////////////////// MAIN /////////////////////////////////////
main(int argc,char ** argv)42 int main(int argc, char **argv)
43 {
44 string db_in;
45 string db_out;
46 string db_in_pe;
47 string db_out_pe;
48
49 options.cluster_thd = 0.95;
50 options.NAA = 10;
51 options.NAAN = NAA8;
52 seq_db.NAAN = NAA8;
53 options.NAA_top_limit = 12;
54 setaa_to_na();
55 mat.set_to_na(); //mat.set_gap(-6,-1);
56
57 float begin_time = current_time();
58 float end_time;
59
60 // *********************************** parse command line and open file
61 if (argc < 5) print_usage_est(argv[0]);
62 if (options.SetOptions( argc, argv, false, true ) == 0) print_usage_est(argv[0]);
63 options.Validate();
64
65 db_in = options.input;
66 db_in_pe = options.input_pe;
67 db_out = options.output;
68 db_out_pe = options.output_pe;
69
70 InitNAA( MAX_UAA );
71 seq_db.NAAN = NAAN_array[options.NAA];
72
73 if ( options.option_r ) {
74 Comp_AAN_idx.resize( seq_db.NAAN );
75 make_comp_short_word_index(options.NAA, NAAN_array, Comp_AAN_idx);
76 }
77
78 if ( options.PE_mode ) {seq_db.Read( db_in.c_str(), db_in_pe.c_str(), options );}
79 else {seq_db.Read( db_in.c_str(), options );}
80
81 cout << "total seq: " << seq_db.sequences.size() << endl;
82 seq_db.SortDivide( options );
83 seq_db.DoClustering( options );
84
85 printf( "writing new database\n" );
86 if ( options.PE_mode ) { seq_db.WriteClusters( db_in.c_str(), db_in_pe.c_str(), db_out.c_str(), db_out_pe.c_str(), options ); }
87 else { seq_db.WriteClusters( db_in.c_str(), db_out.c_str(), options ); }
88
89 // write a backup clstr file in case next step crashes
90 seq_db.WriteExtra1D( options );
91 cout << "program completed !" << endl << endl;
92 end_time = current_time();
93 printf( "Total CPU time %.2f\n", end_time - begin_time );
94 return 0;
95 }
96