1 // =============================================================================
2 // CD-HI-EST
3 // http://cd-hit.org/
4 // Cluster Database at High Identity (EST version)
5 // modified from CD-HI
6 //
7 // program written by
8 //                    Weizhong Li
9 //                    UCSD, San Diego Supercomputer Center
10 //                    La Jolla, CA, 92093
11 //                    Email liwz@sdsc.edu
12 //                 at
13 //                    Adam Godzik's lab
14 //                    The Burnham Institute
15 //                    La Jolla, CA, 92037
16 //                    Email adam@burnham-inst.org
17 //
18 // Modified by:
19 //                    Limin Fu
20 //                    Center for Research in Biological Systems (CRBS), UCSD
21 //                    La Jolla, CA, 92093
22 //                    Email: l2fu@ucsd.edu, fu@daovm.net
23 // =============================================================================
24 
25 #include "cdhit-common.h"
26 #include "cdhit-utility.h"
// Override some definitions from the shared headers for the EST version
// (the original note names cd-hi.h as the header being overridden).
// MAX_UAA is forced to 4 here; main() passes it to InitNAA().
// NOTE(review): presumably 4 is the size of the nucleotide alphabet -- confirm.
#undef MAX_UAA
#define MAX_UAA 4

// Override some definitions from cd-hi-init.h for the EST version.
// The helpers below are only declared here; they are defined elsewhere.

// Called once at the top of main(), before the command line is parsed.
void setaa_to_na();
// Called from main() only when options.option_r is set; it receives
// Comp_AAN_idx, which main() has resized to seq_db.NAAN beforehand.
void make_comp_short_word_index(int NAA, int *NAAN_array, Vector<int> & Comp_AAN_idx);
// Not called from main().
// NOTE(review): by its name, presumably writes the complement of iseq into
// iseq_comp -- confirm against the definition.
void make_comp_iseq(int len, char *iseq_comp, char *iseq);


// Global state used by main(): the run settings and the sequence database.
Options options;
SequenceDB seq_db;
40 
41 ////////////////////////////////////  MAIN /////////////////////////////////////
main(int argc,char ** argv)42 int main(int argc, char **argv)
43 {
44 	string db_in;
45 	string db_out;
46 	string db_in_pe;
47 	string db_out_pe;
48 
49 	options.cluster_thd = 0.95;
50 	options.NAA = 10;
51 	options.NAAN = NAA8;
52 	seq_db.NAAN = NAA8;
53 	options.NAA_top_limit = 12;
54 	setaa_to_na();
55 	mat.set_to_na(); //mat.set_gap(-6,-1);
56 
57 	float begin_time = current_time();
58 	float end_time;
59 
60 	// ***********************************    parse command line and open file
61 	if (argc < 5) print_usage_est(argv[0]);
62 	if (options.SetOptions( argc, argv, false, true ) == 0) print_usage_est(argv[0]);
63 	options.Validate();
64 
65 	db_in     = options.input;
66 	db_in_pe  = options.input_pe;
67 	db_out    = options.output;
68 	db_out_pe = options.output_pe;
69 
70 	InitNAA( MAX_UAA );
71 	seq_db.NAAN = NAAN_array[options.NAA];
72 
73 	if ( options.option_r ) {
74 		Comp_AAN_idx.resize( seq_db.NAAN );
75 		make_comp_short_word_index(options.NAA, NAAN_array, Comp_AAN_idx);
76 	}
77 
78         if ( options.PE_mode ) {seq_db.Read( db_in.c_str(), db_in_pe.c_str(), options );}
79         else                   {seq_db.Read( db_in.c_str(),                   options );}
80 
81 	cout << "total seq: " << seq_db.sequences.size() << endl;
82 	seq_db.SortDivide( options );
83 	seq_db.DoClustering( options );
84 
85 	printf( "writing new database\n" );
86         if ( options.PE_mode ) { seq_db.WriteClusters( db_in.c_str(), db_in_pe.c_str(), db_out.c_str(), db_out_pe.c_str(), options ); }
87         else                   { seq_db.WriteClusters( db_in.c_str(),                   db_out.c_str(),                    options ); }
88 
89 	// write a backup clstr file in case next step crashes
90 	seq_db.WriteExtra1D( options );
91 	cout << "program completed !" << endl << endl;
92 	end_time = current_time();
93 	printf( "Total CPU time %.2f\n", end_time - begin_time );
94 	return 0;
95 }
96