1 // =============================================================================
2 // CD-HI-EST
3 // http://cd-hit.org/
4 // Cluster Database at High Identity (EST version)
5 // modified from CD-HI
6 //
7 // program written by
8 //                    Weizhong Li
9 //                    UCSD, San Diego Supercomputer Center
10 //                    La Jolla, CA, 92093
11 //                    Email liwz@sdsc.edu
12 //                 at
13 //                    Adam Godzik's lab
14 //                    The Burnham Institute
15 //                    La Jolla, CA, 92037
16 //                    Email adam@burnham-inst.org
17 //
18 // Modified by:
19 //                    Limin Fu
20 //                    Center for Research in Biological Systems (CRBS), UCSD
21 //                    La Jolla, CA, 92093
22 //                    Email: l2fu@ucsd.edu, fu@daovm.net
23 // =============================================================================
24 
25 #include "cdhit-common.h"
26 #include "cdhit-utility.h"
27 //over-write some defs in cd-hi.h for est version
28 #undef MAX_UAA
29 #define MAX_UAA 4
30 
31 //over-write some defs in cd-hi-init.h for est version
32 
33 void setaa_to_na();
34 void make_comp_iseq(int len, char *iseq_comp, char *iseq);
35 void make_comp_short_word_index(int NAA, int *NAAN_array, Vector<int> & Comp_AAN_idx);
36 
37 Options options;
38 SequenceDB seq_db;
39 SequenceDB seq_db2;
40 
41 // next two control how if seqs in db2 is longer than reps in db1
42 // by deault, only seqs in db2 that are shorter than rep in db1
43 // are clustered to the rep in db1
44 
45 
46 ////////////////////////////////////  MAIN /////////////////////////////////////
main(int argc,char ** argv)47 int main(int argc, char **argv)
48 {
49 	string db_in;
50 	string db_in2;
51 	string db_out;
52         string db_in_pe;
53         string db_in2_pe;
54         string db_out_pe;
55 
56 
57 	options.cluster_thd = 0.95;
58 	options.NAA = 10;
59 	options.NAAN = NAA8;
60 	seq_db.NAAN = NAA8;
61 	options.NAA_top_limit = 12;
62 	setaa_to_na();
63 	mat.set_to_na(); //mat.set_gap(-6,-1);
64 
65 	float begin_time = current_time();
66 	float end_time;
67 
68 	// ***********************************    parse command line and open file
69 	if (argc < 7) print_usage_est_2d(argv[0]);
70 	if (options.SetOptions( argc, argv, true, true ) == 0) print_usage_est_2d(argv[0]);
71 	options.Validate();
72 
73 	db_in = options.input;
74         db_in_pe  = options.input_pe;
75 	db_in2 = options.input2;
76         db_in2_pe = options.input2_pe;
77 	db_out = options.output;
78         db_out_pe = options.output_pe;
79 
80 
81 	InitNAA( MAX_UAA );
82 	options.NAAN = NAAN_array[options.NAA];
83 	seq_db.NAAN = NAAN_array[options.NAA];
84 	seq_db2.NAAN = NAAN_array[options.NAA];
85 
86 	if ( options.option_r ) {
87 		Comp_AAN_idx.resize( seq_db.NAAN );
88 		make_comp_short_word_index(options.NAA, NAAN_array, Comp_AAN_idx);
89 	}
90 
91         if ( options.PE_mode ) {seq_db.Read( db_in.c_str(), db_in_pe.c_str(), options );}
92         else                   {seq_db.Read( db_in.c_str(),                   options );}
93 	cout << "total seq in db1: " << seq_db.sequences.size() << endl;
94 
95         if ( options.PE_mode ) { seq_db2.Read( db_in2.c_str(), db_in2_pe.c_str(), options );}
96         else                   { seq_db2.Read( db_in2.c_str(),                    options );}
97 	cout << "total seq in db2: " << seq_db2.sequences.size() << endl;
98 
99 	seq_db.SortDivide( options );
100 	seq_db2.SortDivide( options, false );
101 	seq_db2.ClusterTo( seq_db, options );
102 
103 	cout << "writing non-redundant sequences from db2" << endl;
104 	seq_db2.WriteClusters( db_in2.c_str(), db_out.c_str(), options );
105 
106         if ( options.PE_mode ) { seq_db2.WriteClusters( db_in2.c_str(), db_in2_pe.c_str(), db_out.c_str(), db_out_pe.c_str(), options ); }
107         else                   { seq_db2.WriteClusters( db_in2.c_str(),                    db_out.c_str(),                    options ); }
108 
109 	seq_db2.WriteExtra2D( seq_db, options );
110 	cout << "program completed !" << endl << endl;
111 	end_time = current_time();
112 	printf( "Total CPU time %.2f\n", end_time - begin_time );
113 	return 0;
114 } // END int main
115 
116