1 // =============================================================================
2 // CD-HI-EST
3 // http://cd-hit.org/
4 // Cluster Database at High Identity (EST version)
5 // modified from CD-HI
6 //
7 // program written by
8 // Weizhong Li
9 // UCSD, San Diego Supercomputer Center
10 // La Jolla, CA, 92093
11 // Email liwz@sdsc.edu
12 // at
13 // Adam Godzik's lab
14 // The Burnham Institute
15 // La Jolla, CA, 92037
16 // Email adam@burnham-inst.org
17 //
18 // Modified by:
19 // Limin Fu
20 // Center for Research in Biological Systems (CRBS), UCSD
21 // La Jolla, CA, 92093
22 // Email: l2fu@ucsd.edu, fu@daovm.net
23 // =============================================================================
24
25 #include "cdhit-common.h"
26 #include "cdhit-utility.h"
27 //over-write some defs in cd-hi.h for est version
28 #undef MAX_UAA
29 #define MAX_UAA 4
30
31 //over-write some defs in cd-hi-init.h for est version
32
33 void setaa_to_na();
34 void make_comp_iseq(int len, char *iseq_comp, char *iseq);
35 void make_comp_short_word_index(int NAA, int *NAAN_array, Vector<int> & Comp_AAN_idx);
36
37 Options options;
38 SequenceDB seq_db;
39 SequenceDB seq_db2;
40
41 // next two control how if seqs in db2 is longer than reps in db1
42 // by deault, only seqs in db2 that are shorter than rep in db1
43 // are clustered to the rep in db1
44
45
46 //////////////////////////////////// MAIN /////////////////////////////////////
main(int argc,char ** argv)47 int main(int argc, char **argv)
48 {
49 string db_in;
50 string db_in2;
51 string db_out;
52 string db_in_pe;
53 string db_in2_pe;
54 string db_out_pe;
55
56
57 options.cluster_thd = 0.95;
58 options.NAA = 10;
59 options.NAAN = NAA8;
60 seq_db.NAAN = NAA8;
61 options.NAA_top_limit = 12;
62 setaa_to_na();
63 mat.set_to_na(); //mat.set_gap(-6,-1);
64
65 float begin_time = current_time();
66 float end_time;
67
68 // *********************************** parse command line and open file
69 if (argc < 7) print_usage_est_2d(argv[0]);
70 if (options.SetOptions( argc, argv, true, true ) == 0) print_usage_est_2d(argv[0]);
71 options.Validate();
72
73 db_in = options.input;
74 db_in_pe = options.input_pe;
75 db_in2 = options.input2;
76 db_in2_pe = options.input2_pe;
77 db_out = options.output;
78 db_out_pe = options.output_pe;
79
80
81 InitNAA( MAX_UAA );
82 options.NAAN = NAAN_array[options.NAA];
83 seq_db.NAAN = NAAN_array[options.NAA];
84 seq_db2.NAAN = NAAN_array[options.NAA];
85
86 if ( options.option_r ) {
87 Comp_AAN_idx.resize( seq_db.NAAN );
88 make_comp_short_word_index(options.NAA, NAAN_array, Comp_AAN_idx);
89 }
90
91 if ( options.PE_mode ) {seq_db.Read( db_in.c_str(), db_in_pe.c_str(), options );}
92 else {seq_db.Read( db_in.c_str(), options );}
93 cout << "total seq in db1: " << seq_db.sequences.size() << endl;
94
95 if ( options.PE_mode ) { seq_db2.Read( db_in2.c_str(), db_in2_pe.c_str(), options );}
96 else { seq_db2.Read( db_in2.c_str(), options );}
97 cout << "total seq in db2: " << seq_db2.sequences.size() << endl;
98
99 seq_db.SortDivide( options );
100 seq_db2.SortDivide( options, false );
101 seq_db2.ClusterTo( seq_db, options );
102
103 cout << "writing non-redundant sequences from db2" << endl;
104 seq_db2.WriteClusters( db_in2.c_str(), db_out.c_str(), options );
105
106 if ( options.PE_mode ) { seq_db2.WriteClusters( db_in2.c_str(), db_in2_pe.c_str(), db_out.c_str(), db_out_pe.c_str(), options ); }
107 else { seq_db2.WriteClusters( db_in2.c_str(), db_out.c_str(), options ); }
108
109 seq_db2.WriteExtra2D( seq_db, options );
110 cout << "program completed !" << endl << endl;
111 end_time = current_time();
112 printf( "Total CPU time %.2f\n", end_time - begin_time );
113 return 0;
114 } // END int main
115
116