1#!/usr/bin/perl -w
2################################################################################
3######### PSI-cd-hit written by Weizhong Li at http://cd-hit.org
4################################################################################
5our $pid       = $$;
6our $db_in     = "";   ###################
7our $db_out    = "";   # input / output
8our $len_t     = 10;   ###################
9our $NR_clstr  = 0.3;  #
10our $NR_clstre = -1;   #thresholds
11our $g_iden    = 1;    #
12our $opt_aS    = 0.0;  #
13our $opt_aL    = 0.0;  #
14our $circle    = 0;    #
15our $opt_g     = 1;    ####################
16our $blast_exe = "blastall -p blastp -m 8";               #########################
17our $prof_exe  = "blastpgp -m 8";                         #
18our $prof_para = "-j 3 -F T -e 0.001 -b 500 -v 500";      #
19our $prof_db   = "";                                      #
20our $bl_para   = "-F T -e 0.000001 -b 100000 -v 100000";  #  program
21our $bl_STDIN  = 1;                                       #
22our $keep_bl   = 0;                                       #
23our $blast_prog= "blastp";                                #
24our $formatdb  = "formatdb";                              #########################
25our $exec_mode = "local";      #######################
26our $num_qsub   = 1;            #
27our $para_no   = 1;            # compute
28our $sh_file   = "";           #
29our $batch_no_per_node = 50;   #######################
30our $reformat_seg = 50000;
31our $restart_seg  = 20000;
32our $job          = "";
33our $job_file     = "";
34our $date         = `date`;
35our $restart_in   = "";
36our $pwd          = `pwd`; chop($pwd);
37our $db_clstr;
38our $db_log;
39our $db_out1;
40our $seq_dir;
41our $bl_dir;
42our $restart_file;
43our $tmp_db;
44our $remote_perl_script;
45our $remote_sh_script;
46our $bl_path;
47our $bl_plus = 1;   #### use blast+
48our $bl_threads = 1;
49our $skip_long = 0;
50our %qsub_ids = (); #### a list of qsub ids
51our %qstat_xml_data = ();
52
53
54sub parse_para_etc {
55  my ($arg, $cmd);
56  while($arg = shift) {
57    ## input/output:
58    if    ($arg eq "-i")          { $db_in     = shift; }
59    elsif ($arg eq "-o")          { $db_out    = shift; }
60    elsif ($arg eq "-l")          { $len_t     = shift; }
61    ## thresholds
62    elsif ($arg eq "-c")          { $NR_clstr  = shift; }
63    elsif ($arg eq "-ce")         { $NR_clstre = shift; }
64    elsif ($arg eq "-G")          { $g_iden    = shift; }
65    elsif ($arg eq "-aL")         { $opt_aL    = shift; }
66    elsif ($arg eq "-aS")         { $opt_aS    = shift; }
67    elsif ($arg eq "-g")          { $opt_g     = shift; }
68    elsif ($arg eq "-circle")     { $circle    = shift; }
69    elsif ($arg eq "-sl")         { $skip_long = shift; }
70    ## program
71    elsif ($arg eq "-prog")       { $blast_prog= shift; }
72    elsif ($arg eq "-p")          { $prof_para = shift; }
73    elsif ($arg eq "-dprof")      { $prof_db   = shift; die "option -dprof no longer supported!";}
74    elsif ($arg eq "-s")          { $bl_para   = shift; }
75    elsif ($arg eq "-k")          { $keep_bl   = shift; }
76    elsif ($arg eq "-bs")         { $bl_STDIN  = shift; }
77    ## compute
78    elsif ($arg eq "-exec")       { $exec_mode = shift; }
79    elsif ($arg eq "-host")       { $num_qsub   = shift; }
80    elsif ($arg eq "-para")       { $para_no   = shift; }
81    elsif ($arg eq "-shf")        { $sh_file   = shift; }
82    elsif ($arg eq "-blp")        { $bl_threads   = shift; }
83    elsif ($arg eq "-bat")        { $batch_no_per_node = shift; }
84    ## job:
85    elsif ($arg eq "-rs")         { $restart_seg = shift; }
86    elsif ($arg eq "-rf")         { $reformat_seg= shift; }
87    elsif ($arg eq "-restart")    { $restart_in= shift;   }
88    elsif ($arg eq "-J")          { $job       = shift; $job_file = shift; }
89    ## blast path
90    elsif ($arg eq "-P")          { $bl_path   = shift;   }
91    else                          { print_usage(); exit(); }
92  }
93
94  # speical jobs
95  if ($job eq "parse_blout") { job_parse_blout(); exit();}
96
97  if ($blast_prog eq "blastn") {
98    $formatdb  = "formatdb -p F";
99    $blast_exe    = "blastall -p blastn -m 8";
100  }
101  elsif ($blast_prog eq "megablast") {
102    $blast_prog = "blastn"; #### back to blastn for blast parser type
103    $formatdb  = "formatdb -p F";
104    $blast_exe    = "megablast -H 100 -D 2 -m 8";
105  }
106  elsif ($blast_prog eq "blastpgp") {
107    $blast_exe  = "blastpgp -m 8 -j 3";
108  }
109
110  #### for blast+
111  if ($bl_plus) {
112    $formatdb = "makeblastdb -dbtype prot -max_file_sz 8GB";
113    $blast_exe = "blastp -outfmt 6";
114    $bl_para   = "-seg yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads";  #  program
115
116    if ($blast_prog eq "blastn") {
117      $formatdb = "makeblastdb -dbtype nucl -max_file_sz 8GB";
118      $blast_exe    = "blastp -task blastn -outfmt 6";
119      $bl_para   = "-dust yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads";  #  program
120    }
121    elsif ($blast_prog eq "megablast") {
122      $blast_prog = "blastn"; #### back to blastn for blast parser type
123      $formatdb = "makeblastdb -dbtype nucl -max_file_sz 8GB";
124      $blast_exe    = "blastp -task megablast -outfmt 6";
125      $bl_para   = "-dust yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads";  #  program
126    }
127    elsif ($blast_prog eq "blastpgp") {
128      $blast_exe  = "psiblast -outfmt 6 -num_iterations 3 -num_threads $bl_threads";
129    }
130  }
131
132  if ($bl_path) {
133    $blast_exe = "$bl_path/$blast_exe";
134    $formatdb  = "$bl_path/$formatdb";
135  }
136
137  (-e $db_in) || die "No input";
138  ($db_out)   || die "No output";
139
140  $db_clstr  = "$db_out.clstr";
141  $db_log    = "$db_out.log";
142  $db_out1   = "$db_out.out";
143  $seq_dir   = "$db_in-seq";
144  $bl_dir    = "$db_in-bl";
145  $restart_file   =" $db_out.restart";
146
147  $tmp_db    = "$db_in.$pid";
148  $remote_perl_script = "$tmp_db-bl.pl";
149  $remote_sh_script   = "$tmp_db-bl.sh";
150
151  $cmd = `mkdir $bl_dir $seq_dir`;
152
153  write_remote_perl_script();
154  write_remote_sh_script();
155  return;
156}
157########## END parse_para_etc
158
159
160sub read_db {
161  my $des = "";
162  my $seq = "";
163  my $ll;
164
165  open(DBIN, $db_in)         || die "Can not open $db_in";
166  while($ll=<DBIN>){
167    chop($ll);
168    if ($ll =~ /^>/) {
169      $seq =~ s/\s//g;
170      if (length($seq) > $len_t) { add_seq($des, $seq); }
171      $des = $ll; $seq = "";
172
173    }
174    else { $seq .= $ll; }
175  }
176  $seq =~ s/\s//g;
177  if (length($seq) > $len_t) { add_seq($des, $seq); }
178  close(DBIN);
179
180  ($NR_no >=1 ) || die "No sequence readin";
181
182  print OUTT "Total seqs $NR_no in $db_in\n";
183  return;
184}
185########## END read_db
186
187
188sub add_seq {
189  my ($des, $seq) = @_;
190  $des =~ s/\s.+$//;
191  push(@seqs,   $seq);
192  push(@dess,   $des);
193  push(@lens,   length($seq));
194  push(@idens,  0);
195  push(@passeds,0);
196  push(@NR_clstr_nos,0);
197  push(@in_bg, 0);
198  $NR_no++;
199  return;
200}
201########## END add_seq
202
203
204sub open_LOG {
205  open(OUTT, ">> $db_out1") || die "can not open $db_out1";
206  select(OUTT); $|++; ### file handle flush
207  print OUTT "Started $date";
208
209  open(LOG, ">> $db_log")     || die "Can not open $db_log";
210  select(LOG); $|++; ### file handle flush
211  select(STDOUT);
212  return;
213}
214########## END open_LOG
215
216sub write_LOG {
217  my $txt=shift;
218  print LOG "$txt\n";
219}
220
221{## use static variables
222my $last_NR90_no=0;
223my $last_NR_passed=0;
224sub watch_progress {
225  my ($i0, $NR90_no, $NR_passed, $NR_no, $flag) = @_;
226  my $i1 = $i0+1;
227
228  if ( $i1 % 10 == 0 ) {
229    print OUTT ".";
230    $flag = 1 if ( $i1 % 100 == 0 );
231  }
232
233  if ($flag) {
234    my $t1 = (int($NR_passed/$NR_no*10000)) / 100;
235    my $t90 = $NR90_no - $last_NR90_no;
236    my $tno = $NR_passed - $last_NR_passed;
237    my ($tu, $ts, $cu, $cs) = times();
238    my $tt = $tu + $ts + $cu + $cs;
239    print OUTT
240      "$i1 finished $NR90_no clusters $NR_passed passed $t90/$tno clstr/passed $t1% done $tt cpu\n";
241    $last_NR90_no = $NR90_no;
242    $last_NR_passed = $NR_passed;
243  }
244  return;
245}
246}
247
248
249sub close_LOG {
250  my $date = `date`; print OUTT "Completed $date\n";
251  my $total_cpu = total_remote_cpu();
252  print OUTT "Total CPUs on remote hosts: $total_cpu\n";
253  close(OUTT);
254  close(LOG);
255  return;
256}
257########## END close_LOG
258
259###### need to change to read dir because
260sub total_remote_cpu {
261  my ($i, $j, $k, $ll);
262  my $tt = 0;
263  for ($j=0; $j<$num_qsub; $j++) {
264    open(TCPU, "$seq_dir/host.$j.cpu") || next;
265    while($ll = <TCPU>) {
266      chop($ll);
267      $tt += $ll;
268    }
269    close(TCPU);
270  }
271  return $tt;
272}
273########## END total_remote_cpu
274
275
276sub job_parse_blout {
277  my ($i, $j, $k);
278  my @hits = process_blout_blastp_blastn($job_file);
279
280  open(BLOUT2, "> $job_file.out") || return;
281  foreach $i (@hits) {
282    print BLOUT2 join("\t", @{$i}), "\n";
283  }
284  print BLOUT2 "#\n";
285  close(BLOUT2);
286  return;
287}
288########## END job_parse_blout
289
290
291sub write_restart {
292  my ($i0, $i, $j, $k);
293  open(RES, "> $restart_file") || die;
294
295  for ($i0=0; $i0<$NR_no; $i0++) {
296    $i = $NR_idx[$i0];
297    print RES "$i\t$NR_clstr_nos[$i]\t$idens[$i]\t$passeds[$i]\n";
298  }
299
300  close(RES);
301  return;
302}
303########## END write_restart
304
305
306sub read_restart {
307  my ($ii, $i0, $i, $j, $k, $ll);
308  my @lls;
309  open(RESIN, $restart_in) || die;
310
311  $NR_passed = 0;
312  $NR90_no   = 0;
313  $ii = -1;
314  $i0 = 0;
315  while($ll = <RESIN>) {
316    chop($ll);
317    @lls = split(/\t/,$ll);
318    $i = $lls[0];
319    $NR_clstr_nos[$i] = $lls[1];
320    $idens[$i]       = $lls[2];
321    $passeds[$i]     = $lls[3];
322    $NR_passed++   if ($lls[3]);
323
324    if ($lls[2] eq "*") { #rep
325      $NR90_no++;
326      $ii = $i0 if ($lls[3]);
327    }
328    $NR_idx[$i0] = $i;
329    $i0++; # idx of sorted , see write_restart
330  }
331  close(RESIN);
332
333  $ii++; # $ii to be last rep processed
334  return $ii;
335}
336########## END read_restart
337
338
339sub write_db_clstr {
340  my ($i0, $i, $j, $k);
341
342  my @NR90_seq = ();
343  for ($i=0; $i<$NR90_no; $i++) { $NR90_seq[$i] = []; }
344  for ($i0=0; $i0<$NR_no; $i0++) {
345    $i = $NR_idx[$i0];
346    next unless ($passeds[$i]);
347    $j = $NR_clstr_nos[$i];
348    next unless ($j < $NR90_no);
349    push(@{$NR90_seq[$j]}, $i);
350  }
351
352  open(DBCLS, "> $db_clstr") || die "Can not write $db_clstr";
353  for ($i=0; $i<$NR90_no; $i++) {
354    print DBCLS ">Cluster $i\n";
355    $k = 0;
356    foreach $j (@{ $NR90_seq[$i] }) {
357      my $des = (split(/\s+/,$dess[$j]))[0];
358      print DBCLS "$k\t$lens[$j]"."aa, $des... ";
359      if ($idens[$j] eq "*") { print DBCLS "*\n"; }
360      else                   { print DBCLS "at $idens[$j]\n";}
361      $k++;
362    }
363  }
364  close(DBCLS);
365
366  @NR90_seq=();
367  return;
368}
369########## END write_db_clstr
370
371
372sub remove_raw_blout {
373  my $NR_sofar = shift;
374  my ($i0, $i, $j, $k, $cmd);
375  return if ($keep_bl);
376
377  for ($i0=$NR_sofar; $i0>=0; $i0--) {
378    $i = $NR_idx[$i0];
379    next unless $passeds[$i];
380    next unless ($idens[$i] eq "*"); #only reps have blout
381    my $fout = "$bl_dir/$i";
382    last unless (-e "$fout.out");    #removed from last call
383    if (not $bl_STDIN) { $cmd = `rm -f $fout`; }
384    $cmd = `rm -f $bl_dir/$i.out`;
385  }
386  return;
387}
388########## END remove_raw_blout
389
390
391sub remove_raw_blout_bg {
392  my $NR_sofar = shift;
393  my ($i0, $i, $j, $k, $cmd);
394  return if ($keep_bl);
395
396  my $tmp_sh_script   = "$tmp_db-rm-$NR_sofar.sh";
397  open(OUTRM, ">$tmp_sh_script") || die "can not write to $tmp_sh_script";
398
399  for ($i0=$NR_sofar; $i0>=0; $i0--) {
400    $i = $NR_idx[$i0];
401    next unless $passeds[$i];
402    next unless ($idens[$i] eq "*"); #only reps have blout
403    my $fout = "$bl_dir/$i";
404    last unless (-e "$fout.out");    #removed from last call
405    if (not $bl_STDIN) { print OUTRM "rm -f $fout\n"; }
406    print OUTRM "rm -f $bl_dir/$i.out";
407  }
408  print OUTRM "rm -f $tmp_sh_script\n"; ## remove self
409  close(OUTRM);
410  sleep(3);
411
412  $cmd = `sh $tmp_sh_script >/dev/null 2>&1 &`;
413  return;
414}
415########## END remove_raw_blout_bg
416
417
418sub fish_other_homolog {
419  my ($i, $j, $k, $i0, $j0, $k0);
420  $id = shift; # real idx, not sorted idx
421  my @hits = ();
422
423  wait_blast_out("$bl_dir/$id.out");
424  open(BLPOUT, "$bl_dir/$id.out") || return;
425  while($i=<BLPOUT>) {
426    last if ($i =~ /^#/);
427    chop($i);
428    push(@hits, [split(/\t/,$i)]);
429  }
430  close(BLPOUT);
431  my $rep_len = $lens[$id];
432
433  foreach $i (@hits) {
434    my $id1 = $i->[0];
435    next unless ($id1 < $NR_no);
436    next if ($idens[$id1] eq "*"); #existing reps
437    next if ($lens[$id1] > $rep_len); # in opt_g=1 mode, preventing it from being clustered into short rep
438
439    if ( $passeds[$id1] ) { #### if this hit is better -g 1 mode
440      my $old_e = (split(/\//,$idens[$id1]))[0];
441      if ($i->[3] < $old_e) {
442        $idens[$id1]   = "$i->[3]/$i->[2]aa/$i->[1]%";
443        $passeds[$id1] = 1;
444        $NR_clstr_nos[$id1] = $NR90_no;
445      }
446      next;
447    }
448
449    $idens[$id1]   = "$i->[3]/$i->[2]aa/$i->[1]%";
450    $passeds[$id1] = 1;
451    $NR_clstr_nos[$id1] = $NR90_no;
452    $NR_passed++;
453  }
454  return;
455}
456########## END fish_other_homolog
457
458
459########### if a hit has multiple HSPs on both + - strands
460########### keep only the HSPs, whose strand is same as the top HSP
461sub keep_strand_with_top_hsp {
462  my $self = shift;
463  my ($i,$j,$k);
464
465  my %id_2_strand = ();
466  my @new_sbj = ();
467  my $new_no = 0;
468  for ($i=0; $i<$self->{no}; $i++) {
469    my $p = $self->{sbj}->[$i];
470    my ($id1, $len_sub) = split(/\./, $p->{id});
471    if (not defined($id_2_strand{$id1})) {
472      $id_2_strand{$id1} = $p->{frame};
473    }
474    if ($p->{frame} eq $id_2_strand{$id1}) { #### this stand is same as the top strand
475      push(@new_sbj, $self->{sbj}->[$i]);
476      $new_no++;
477    }
478  }
479  $self->{no} = $new_no;
480  $self->{sbj} = [@new_sbj];
481}
482########## END keep_strand_with_top_hsp
483
484########## for blastpgp -j no (no>1)
485########## keep hits from the last round
486sub keep_hsp_of_last_round {
487  my $self = shift;
488  my ($i,$j,$k);
489
490  my @new_sbj = ();
491  my $new_no  = 0;
492  my $last_score = 9999999*9999999*9999999; # a big one
493  for ($i=0; $i<$self->{no}; $i++) {
494    my $p = $self->{sbj}->[$i];
495    my $score = $p->{score};
496
497    if ($score > $last_score) { ## this is new round of hits
498      @new_sbj = ();
499      $new_no  = 0;
500    }
501    $last_score = $score;
502    push(@new_sbj, $self->{sbj}->[$i]);
503    $new_no++;
504  }
505  $self->{no} = $new_no;
506  $self->{sbj} = [@new_sbj];
507}
508########## END keep_hsp_of_last_round
509
510########## if a query hit a subject with multiple HSPs
511########## only the top HSP is kept
512sub keep_top_hsp {
513  my $self = shift;
514  my ($i,$j,$k);
515
516  my %id_exist = ();
517  my @new_sbj = ();
518  my $new_no = 0;
519  for ($i=0; $i<$self->{no}; $i++) {
520    my $p = $self->{sbj}->[$i];
521    my ($id1, $len_sub) = split(/\./, $p->{id});
522    next unless ($len_sub >0) ;
523
524    if (not defined($id_exist{$id1})) {
525      $id_exist{$id1} = 1;
526      push(@new_sbj, $self->{sbj}->[$i]);
527      $new_no++;
528    }
529  }
530  $self->{no} = $new_no;
531  $self->{sbj} = [@new_sbj];
532}
533########## keep_top_hsp
534
535########## let the top hsp to start at 0 for both query and subject
536########## i.e. the begining of HSP to be new original - coordinate 0
537########## then reset all other HSPs' alignment coordinates
538sub reset_alignment_coor_for_circle_seq {
539  my $self = shift;
540  my ($i,$j,$k);
541
542  my $last_id = "";
543  $j = 0;
544  my $hsp_count = 0; # number of HSPs for a subject
545  for ($i=0; $i<$self->{no}; $i++) {
546    my $p = $self->{sbj}->[$i];
547    my ($id1, $len_sub) = split(/\./, $p->{id});
548
549    if ($id1 ne $last_id) {
550      if ($hsp_count > 1) {  # it is necessary to reset coordinate when at least 2 HSP
551        my $p_top_hsp = $self->{sbj}->[$j];
552        my $len_q = (split(/\./, $p_top_hsp->{qid}))[1];
553        my $len_s = (split(/\./, $p_top_hsp->{id}))[1];
554        my $ref_q = ($p_top_hsp->{qfrom} < $p_top_hsp->{qend}) ? $p_top_hsp->{qfrom} : $p_top_hsp->{qend};
555        my $ref_s = ($p_top_hsp->{sfrom} < $p_top_hsp->{send}) ? $p_top_hsp->{sfrom} : $p_top_hsp->{send};
556        for ($k = $j; $k<$j+$hsp_count; $k++) {
557          $self->{sbj}->[$k]->{qfrom} -= $ref_q; if ($self->{sbj}->[$k]->{qfrom} < 0) {$self->{sbj}->[$k]->{qfrom} += $len_q;}
558          $self->{sbj}->[$k]->{qend}  -= $ref_q; if ($self->{sbj}->[$k]->{qend}  < 0) {$self->{sbj}->[$k]->{qend}  += $len_q;}
559          $self->{sbj}->[$k]->{sfrom} -= $ref_s; if ($self->{sbj}->[$k]->{sfrom} < 0) {$self->{sbj}->[$k]->{sfrom} += $len_s;}
560          $self->{sbj}->[$k]->{send}  -= $ref_s; if ($self->{sbj}->[$k]->{send}  < 0) {$self->{sbj}->[$k]->{send}  += $len_s;}
561        }
562      }
563      $j = $i;
564      $hsp_count = 0;
565    }
566    $last_id = $id1;
567    $hsp_count++;
568  }
569
570      #last subject
571      if ($hsp_count > 1) {  # it is necessary to reset coordinate when at least 2 HSP
572        my $p_top_hsp = $self->{sbj}->[$j];
573        my $len_q = (split(/\./, $p_top_hsp->{qid}))[1];
574        my $len_s = (split(/\./, $p_top_hsp->{id}))[1];
575        my $ref_q = ($p_top_hsp->{qfrom} < $p_top_hsp->{qend}) ? $p_top_hsp->{qfrom} : $p_top_hsp->{qend};
576        my $ref_s = ($p_top_hsp->{sfrom} < $p_top_hsp->{send}) ? $p_top_hsp->{sfrom} : $p_top_hsp->{send};
577        for ($k = $j; $k<$j+$hsp_count; $k++) {
578          $self->{sbj}->[$k]->{qfrom} -= $ref_q; if ($self->{sbj}->[$k]->{qfrom} < 0) {$self->{sbj}->[$k]->{qfrom} += $len_q;}
579          $self->{sbj}->[$k]->{qend}  -= $ref_q; if ($self->{sbj}->[$k]->{qend}  < 0) {$self->{sbj}->[$k]->{qend}  += $len_q;}
580          $self->{sbj}->[$k]->{sfrom} -= $ref_s; if ($self->{sbj}->[$k]->{sfrom} < 0) {$self->{sbj}->[$k]->{sfrom} += $len_s;}
581          $self->{sbj}->[$k]->{send}  -= $ref_s; if ($self->{sbj}->[$k]->{send}  < 0) {$self->{sbj}->[$k]->{send}  += $len_s;}
582        }
583      }
584
585  return;
586}
587########## reset_alignment_coor_for_circle_seq
588
589
590sub process_blout_blastp_blastn {
591  my ($i, $j, $k, $i0, $j0, $k0);
592  my $blout = shift;
593  my @blhits = ();
594
595  #### need $len_rep
596  my $len_rep = 0;
597  my $bl = readblast_m8("", $blout);
598  if ($blast_prog eq "blastn") { keep_strand_with_top_hsp($bl); }
599  if (($blast_prog eq "blastpgp") and (not $prof_db)) {keep_hsp_of_last_round($bl); }
600
601  if ($g_iden == 0 ) { #### Local identity
602    keep_top_hsp($bl); #### local alignment, only the top HSP
603
604    for ($i=0; $i<$bl->{no}; $i++) {
605      my $p = $bl->{sbj}->[$i];
606      my ($id1, $len_sub) = split(/\./, $p->{id});
607      my $frame = $p->{frame};
608      if (not $len_rep) {$len_rep = (split(/\./,$p->{qid}))[1]; }
609      my $iden       = $p->{iden};
610      next unless (($len_sub >0) and ($len_rep>0));
611      my $cov_aS     = $p->{alnln} / $len_sub;
612      my $cov_aL     = $p->{alnln} / $len_rep;
613      my $exp1       = $p->{expect};
614
615      if (($iden/100 > $NR_clstr or $exp1<$NR_clstre) and ($cov_aS >= $opt_aS) and ($cov_aL >= $opt_aL) ) {
616        push(@blhits, [$id1, $iden, $p->{alnln}, $exp1, $frame]);
617      }
618    }
619    return @blhits;
620  } #### END if ($g_iden == 0 )
621  else { #### Global idnetity
622    if (($blast_prog eq "blastn") and $circle) { reset_alignment_coor_for_circle_seq($bl); }
623    #### get colinear non-overlapping HSPs
624    my @hsp = (); #### [id, len, qfrom, qend, sbegin, send, expect]
625    my $iden_letters = 0;
626    my $aln_letters = 0;
627    my @aln_lens = ();
628    my $hsp_no = 0;
629    for ($i=0; $i<$bl->{no}; $i++) {
630      my $p = $bl->{sbj}->[$i];
631      my ($id1, $len_sub) = split(/\./, $p->{id});
632      my $frame = $p->{frame};
633      if (not $len_rep) {$len_rep = (split(/\./,$p->{qid}))[1]; }
634      next unless (($len_sub >0) and ($len_rep>0));
635
636      if ($hsp_no) {
637        if ($id1 ne $hsp[0]->[0]) {
638          #### 1. parse previous subject's HSPs
639          my $iden       = int($iden_letters / $hsp[0]->[1] * 10000)/100;
640          my $cov_aS     = $aln_letters  / $hsp[0]->[1];
641          my $cov_aL     = $aln_letters  / $len_rep;
642          my $exp1       = $hsp[0]->[6];
643          my $frame      = $hsp[0]->[7];
644
645          if (($iden/100 > $NR_clstr or $exp1<$NR_clstre) and ($cov_aS >= $opt_aS) and ($cov_aL >= $opt_aL) ) {
646            #push(@blhits, [$hsp[0]->[0], $iden, $aln_letters, $exp1, $frame]);
647            push(@blhits, [$hsp[0]->[0], $iden, join(":", @aln_lens), $exp1, $frame]);
648          }
649          #### 2. init some values
650          @hsp = ();
651          $iden_letters = 0;
652          $aln_letters = 0;
653          @aln_lens = ();
654          $hsp_no = 0;
655        }
656      }
657
658      #check whether overlap with previous high score HSPs
659      my $overlap_flag = 0;
660      for ($j=0; $j<$hsp_no; $j++) {
661        if (overlap1($p->{qfrom}, $p->{qend}, $hsp[$j]->[2], $hsp[$j]->[3])) { $overlap_flag = 1; last; }
662        if (overlap1($p->{sfrom}, $p->{send}, $hsp[$j]->[4], $hsp[$j]->[5])) { $overlap_flag = 1; last; }
663      }
664      next if ($overlap_flag);
665
666      #check whether this HSP cross with previous high score HSPs
667      my $cross_flag = 0;
668      for ($j=0; $j<$hsp_no; $j++) {
669        if (cross1($p->{qfrom}, $p->{qend}, $hsp[$j]->[2], $hsp[$j]->[3],
670                   $p->{sfrom}, $p->{send}, $hsp[$j]->[4], $hsp[$j]->[5])) {
671          $cross_flag = 1; last;
672        }
673      }
674      next if ($cross_flag);
675
676      push(@hsp, [$id1, $len_sub, $p->{qfrom}, $p->{qend}, $p->{sfrom}, $p->{send}, $p->{expect}, $p->{frame}]);
677      $iden_letters += int($p->{iden} * $p->{alnln} / 100);
678      $aln_letters += $p->{alnln};
679      push(@aln_lens, $p->{alnln});
680      $hsp_no++;
681    }
682
683      if ($hsp_no) { #last record
684        #### 1. parse previous subject's HSPs
685        my $iden       = int($iden_letters / $hsp[0]->[1] * 10000)/100;
686        my $cov_aS     = $aln_letters  / $hsp[0]->[1];
687        my $cov_aL     = $aln_letters  / $len_rep;
688        my $exp1       = $hsp[0]->[6];
689        my $frame      = $hsp[0]->[7];
690
691        if (($iden/100 > $NR_clstr or $exp1<$NR_clstre) and ($cov_aS >= $opt_aS) and ($cov_aL >= $opt_aL) ) {
692          #push(@blhits, [$hsp[0]->[0], $iden, $aln_letters, $exp1, $frame]);
693          push(@blhits, [$hsp[0]->[0], $iden, join(":", @aln_lens), $exp1, $frame]);
694        }
695      }
696
697    return @blhits;
698  }
699}
700########## END process_blout_blastp_blastn
701
702
703sub overlap1 {
704  my ($b1, $e1, $b2, $e2) = @_;
705
706  my $t; ###
707  if ($e1 < $b1) { $t  = $e1; $e1 = $b1; $b1 = $t; }
708  if ($e2 < $b2) { $t  = $e2; $e2 = $b2; $b2 = $t; }
709
710  return 0 if ($e2 < $b1);
711  return 0 if ($b2 > $e1);
712  return ( ($e1<$e2)? $e1:$e2 )-( ($b1>$b2)? $b1:$b2);
713}
714########## END overlap1
715
716## modified on 2013_0818 to hancle +- frames
717sub cross1 {
718  my ($q_b1, $q_e1, $q_b2, $q_e2,
719      $s_b1, $s_e1, $s_b2, $s_e2) = @_;
720
721  my $fr_q1 = ($q_b1 < $q_e1) ? 1 : -1;
722  my $fr_q2 = ($q_b2 < $q_e2) ? 1 : -1;
723  my $fr_s1 = ($s_b1 < $s_e1) ? 1 : -1;
724  my $fr_s2 = ($s_b2 < $s_e2) ? 1 : -1;
725
726  my $fr1 = $fr_q1 * $fr_s1;
727  my $fr2 = $fr_q2 * $fr_s2;
728  return 1 if (($fr1 * $fr2) < 0); # one ++ and one +-
729
730  my $t;
731  if ($q_e1 < $q_b1) { $t    = $q_e1; $q_e1 = $q_b1; $q_b1 = $t; }
732  if ($q_e2 < $q_b2) { $t    = $q_e2; $q_e2 = $q_b2; $q_b2 = $t; }
733  if ($s_e1 < $s_b1) { $t    = $s_e1; $s_e1 = $s_b1; $s_b1 = $t; }
734  if ($s_e2 < $s_b2) { $t    = $s_e2; $s_e2 = $s_b2; $s_b2 = $t; }
735
736# after above transformation
737#                               0           q_b1           q_e1             q_b2        q_e2       qlen
738# query                      5' ====================================================================
739# match                                     ||||||||||||||||                |||||||||||||
740# subject                         5' ========================================================================>>>>>> frame +
741#                                    0      s_b1           s_e1             s_b2        s_e2                 slen
742
743# match                                     ||||||||||||||||                |||||||||||||
744# subject                         3' ========================================================================>>>>>> frame -
745#                                    slen   s_e1           s_b1             s_e2        s_b2                 0
746
747  if (($fr1 > 0) and ($fr2>0)) { # both ++
748    return ( (($q_b2-$q_b1)*($s_b2-$s_b1) <0) ? 1 : 0);
749  }
750  else { # both --
751    return ( (($q_b2-$q_b1)*($s_e1-$s_e2) <0) ? 1 : 0);
752  }
753
754}
755########## END cross1
756
757
758## modified on 2013_0818 to hancle +- frames
759sub cross1_before_2013_0818 {
760  my ($q_b1, $q_e1, $q_b2, $q_e2,
761      $s_b1, $s_e1, $s_b2, $s_e2) = @_;
762
763  my $t;
764  if ($q_e1 < $q_b1) { $t    = $q_e1; $q_e1 = $q_b1; $q_b1 = $t; }
765  if ($q_e2 < $q_b2) { $t    = $q_e2; $q_e2 = $q_b2; $q_b2 = $t; }
766  if ($s_e1 < $s_b1) { $t    = $s_e1; $s_e1 = $s_b1; $s_b1 = $t; }
767  if ($s_e2 < $s_b2) { $t    = $s_e2; $s_e2 = $s_b2; $s_b2 = $t; }
768
769  return ( (($q_b2-$q_b1)*($s_b2-$s_b1) <0) ? 1 : 0);
770}
771########## END cross1
772
773sub readblast_m8 {
774  my ($i, $j, $k, $ll, $no);
775  my ($q_seq, $filename) = @_;
776
777
778  my $fh  = "BL" ;
779  if ($bl_STDIN) {      $fh = "STDIN"; }
780  else           { open($fh, $filename) || return; }
781
782  my @this_sbj = ();
783  $no = 0;
784  while($ll = <$fh>) {
785    chop($ll);
786    my @lls = split(/\t/,$ll);
787    my $frame = "";
788       $frame .= ($lls[6] < $lls[7]) ? "+" : "-";
789       $frame .= ($lls[8] < $lls[9]) ? "+" : "-";
790    next unless ($lls[0] and $lls[1]);
791    $this_sbj[$no] = {
792      'qid'     => $lls[0],
793      'id'      => $lls[1],
794      'iden'    => $lls[2],
795      'alnln'   => $lls[3],
796      'ms'      => $lls[4],
797      'gap'     => $lls[5],
798      'qfrom'   => $lls[6],
799      'qend'    => $lls[7],
800      'sfrom'   => $lls[8],
801      'send'    => $lls[9],
802      'expect'  => $lls[10],
803      'score'   => $lls[11],
804      'frame'   => $frame,
805    };
806
807    $no++;
808# BLASTP 2.2.24 [Aug-08-2010]
809# Query: gi|388328107|pdb|4DDG|A Chain A, Crystal Structure Of Human Otub1UBCH5B~UBUB
810# Database: pdbaa.fa
811# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
812#gi|388328107|pdb|4DDG|A gi|388328107|pdb|4DDG|A 91.81   171     9       3       6       171     1       171     6e-89    323
813#gi|388328107|pdb|4DDG|A gi|388328107|pdb|4DDG|A 96.51   86      3       0       235     320     155     240     2e-41    166
814  }
815  close($fh) if (not $bl_STDIN);
816
817  my $self = {
818    'no'  => $no,
819    'sbj' => [@this_sbj],
820  };
821  return $self;
822}
823########## END readblast_m8
824
825
826sub blast_formatdb {
827  my ($i0, $i, $j, $k, $len1);
828
829  open(FDB, "> $tmp_db") || die;
830  $j = 0;
831  $len1 = 0;
832  for ($i0=$NR_no-1; $i0>=0; $i0--) { ### from shortest to longest
833    $i = $NR_idx[$i0];
834    last if ($idens[$i] eq "*"); ### last if reach rep
835    next if ($lens[$i] < $opt_aL_lower_band);
836    next if ($passeds[$i] and ($opt_g==0));
837    my $seq = $seqs[$i];
838    $seq =~ s/(.{70})/$1\n/g;
839    $seq =~ s/\n$//;
840    #print FDB ">$i $dess[$i]\n$seq\n";
841    print FDB ">$i.$lens[$i]\n$seq\n";
842    $j++;
843    $len1 += $lens[$i];
844  }
845  close(FDB);
846
847  while(1) {
848    opendir(SEQDB, $seq_dir) || next;
849    my @leftseqs = grep {/lock/} readdir(SEQDB);
850    closedir(SEQDB);
851
852    last unless @leftseqs;
853    sleep(3);
854  }
855
856  return(0, 0) unless ($j > 0);
857
858  my $cmd_line = "$formatdb -i $tmp_db";
859     $cmd_line = "$formatdb -in $tmp_db" if ($bl_plus);
860  my $cmd = `$cmd_line`;
861
862  ((-e "$tmp_db.phr") and (-e "$tmp_db.pin") and (-e "$tmp_db.psq")) ||
863  ((-e "$tmp_db.nhr") and (-e "$tmp_db.nin") and (-e "$tmp_db.nsq")) ||
864  ((-e "$tmp_db.00.phr") and (-e "$tmp_db.00.pin") and (-e "$tmp_db.00.psq")) ||
865  ((-e "$tmp_db.00.nhr") and (-e "$tmp_db.00.nin") and (-e "$tmp_db.00.nsq"))
866     || die "Can not formatdb";
867
868  return($j, $len1);
869}
870########## END blast_formatdb
871
872
873sub remove_blast_db {
874  my ($i, $j, $k);
875  $cmd = `rm -f $tmp_db`;
876  $cmd = `rm -f $tmp_db.p*`;
877  $cmd = `rm -f $tmp_db.n*`;
878
879  return;
880}
881########## END remove_blast_db
882
883
884my $common_usage = <<EOD;
885
886Options
887  input/output:
888    -i  in_dbname, required
889    -o  out_dbname, required
890    -l  length_of_throw_away_sequences, default 10
891
892  thresholds:
893    -c  clustering threshold (sequence identity), default 0.3
894    -ce clustering threshold (blast expect), default -1,
895        it means by default it doesn't use expect threshold,
896        but with positive value, the program cluster seqs if similarities
897        meet either identity threshold or expect threshold
898    -G  (1/0) use global identity? default 1
899        two sequences Long (i.e. representative) and Short (redunant) may have multiple
900        alignment fragments (i.e. HSPs), see:
901        seq1  xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx   Long sequence
902                   ||||||||||||||||||             /////////////          i.e. representative
903                   ||||||||||||||||||            /////////////                sequence
904                   ||||||||HSP 1 ||||           ////HSP 2 ///
905                   ||||||||||||||||||          /////////////
906                   ||||||||||||||||||         /////////////
907        seq2    xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx          Short sequence
908                   <<  length 1    >>        <<   len 2 >>               i.e. redundant
909                <<<<<<<<<<<< length of short sequence >>>>>>>>>>>>>>          sequence
910
911                          total identical letters from all co-linear and non-overlapping HSPs
912        Glogal identity = -------------------------------------------------------------------
913                                        length of short sequence
914        Local identity  = identity of the top high score HSP
915        if you prefer to use -G 0, it is suggested that you also
916        use -aS, -aL, such as -aS 0.8, to prevent very short matches.
917    -aL	alignment coverage for the longer sequence, default 0.0
918 	if set to 0.9, the alignment must covers 90% of the sequence
919    -aS	alignment coverage for the shorter sequence, default 0.0
920 	if set to 0.9, the alignment must covers 90% of the sequence
921    -g  (1/0), default 0
922        by cd-hit's default algorithm, a sequence is clustered to the first
923        cluster that meet the threshold (fast cluster). If set to 1, the program
924        will cluster it into the most similar cluster that meet the threshold
925        (accurate but slow mode)
926        but either 1 or 0 won't change the representatives of final clusters
927    -circle (1/0), default 0
928        when set to 1, treat sequences as circular sequence.
929        bacterial genomes, plasmids are circular, but their genome coordinate maybe arbitary,
930        the 2 HSPs below will be treated as non co-linear with -circle 0
931        the 2 HSPs below will be treated as     co-linear with -circle 1
932              -------------circle-----------
933              |                            |
934        seq1  xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx      genome / plasmid 1
935               \\\\\\\\      /////////////
936                \\\\\\\\    /////////////
937                  HSP 2 -> ////HSP 1 ///   <-HSP 2
938                          /////////////     \\\\\\\\
939                         /////////////       \\\\\\\\
940        seq2           xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx   genome / plasmid 2
941                       |                             |
942                       -----------circle--------------
943    -sl, length of very long sequences to be skipped, default 0, no skipping
944        e.g. -sl 5000 means sequences longer than 5000 aa will be treated as singleton clusters
945             without clustering, to save time, especially when there is -aL option in place, very
946             long sequences will not be clustered anyway.
947   program:
948     -prog (blastp, blastn, megablast, blastpgp), default blastp
949     -p  profile search para, default
950           "-j 3 -F F -e 0.001 -b 500 -v 500"
951     -dprof database for building PSSM, default using input
952         you can also use another database that is more comprehensive like NR80
953     -s  blast search para, default
954           "-F F -e 0.000001 -b 100000 -v 100000"
955     -bs (1/0) default 1
956        pipe blast results from into parser instead of save in hard drive (save time)
957
958   compute:
959     -exec (qsub, local) default local
960          this program writes a shell script to run blast, this script is
961          either performed locally by sh or remotely by qsub
962          with qsub, you can use PBS, SGE etc
963     -host number of hosts, ie number of qsub jobs
964     -para number of parallel blast job per qsub job (each blast can use multi cores), default 1
965     -blp  number of threads per  blast job, default 1
966           number of threads per blast job X number of parallel blast job per qsub job
967           should <= the number of cores in your computer
968           if your computer grid has 32 cores / node, do either of the followings
969           -para 4  -blp 8
970           -para 8  -blp 4
971           -para 16 -blp 2
972           -para 32 -blp 1
973     -bat number of sequences a blast job to process
974     -shf a filename for add local settings into the job shell script
975          for example, when you run PBS jobs, you can add quene name etc in this
976          file and this script will add them into the job shell script
977e.g. template file for PBS
978#!/bin/sh
979#PBS -v PATH
980#PBS -l walltime=8:00:00
981#PBS -q job_queue.q
982
983e.g. template file for SGE or OGE
984#!/bin/sh
985#\$ -v PATH
986#\$ -q job_queue.q
987#\$ -V
988#\$ -pe orte 8
989
990  job:
991    -rs steps of save restart file and clustering output, default 5000
992        everytime after process 5000 sequences, program write a
993        restart file and current clustering information
994    -restart restart file, readin a restart file
995        if program crash, stoped, termitated, you can restart it by
996        add a option "-restart sth.restart"
997    -rf steps of re format blast database, default 200,000
998        if program clustered 200,000 seqs, it remove them from seq
999        pool, and re format blast db to save time
1000    -J  job, job_file, exe specific jobs like parse blast outonly
1001        DO NOT use it, it is only used by this program itself
1002    -k (1/0) keep blast raw output file, default $keep_bl
1003
1004    -P path to executables
1005EOD
1006
1007
1008sub print_usage {
1009  print <<EOD;
1010Usage psi-cd-hit [Options]
1011$common_usage
1012
1013                                    ==============================
1014                                    by Weizhong Li, liwz\@sdsc.edu
1015                                    ==============================
1016    If you find cd-hit useful, please kindly cite:
1017
1018    "Clustering of highly homologous sequences to reduce thesize of large protein database", Weizhong Li, Lukasz Jaroszewski & Adam GodzikBioinformatics, (2001) 17:282-283
1019    "Cd-hit: a fast program for clustering and comparing large sets of protein or nucleotide sequences", Weizhong Li & Adam Godzik Bioinformatics, (2006) 22:1658-1659
1020
1021EOD
1022  return;
1023}
1024########## END print_usage
1025
1026
1027## like above, but don't assign seqs to specific node
1028## while let nodes run them autoly
1029sub  run_batch_blast3 {
1030  my $i0 = shift;
1031  my ($id, $i, $j, $k, $cmd);
1032
1033  #### wait before qsubs
1034  if ($exec_mode eq "qsub") {
1035    while(1) {
1036      SGE_qstat_xml_query();
1037      last unless (%qsub_ids);
1038
1039      my $wait_flag = 0;
1040      foreach my $qsub_id (keys %qsub_ids) {
1041        if (defined($qstat_xml_data{$qsub_id})) { #### still running
1042          $wait_flag = 1;
1043          $cmd = `qdel -f $qsub_id`; #### at this point, all running jobs are not necessary,
1044          print LOG "force delete un necessary job $qsub_id\n";
1045        }
1046        else {
1047          delete $qsub_ids{$qsub_id};
1048        }
1049      }
1050
1051      if ($wait_flag) {print LOG "wait submitted jobs\n"; sleep(1); }
1052    }
1053
1054    #### delete seq files from last batch
1055    opendir(DIR1, $seq_dir);
1056    my @files = grep { /^\d/ } readdir(DIR1);
1057    closedir(DIR1);
1058    foreach $i (@files) {
1059      $cmd = `rm -f $seq_dir/$i`;
1060      print LOG "remove un necessary seq file $i\n"
1061    }
1062  }
1063
1064  my $total_jobs = $batch_no_per_node * $num_qsub * $para_no;
1065
1066  for ($k=0; $i0<$NR_no; $i0++) {
1067    $id = $NR_idx[$i0];
1068    next if ($passeds[$id]);
1069    next if ($in_bg[$id]);
1070    next if ($lens[$id] < $opt_aL_upper_band);
1071    $in_bg[$id] = 1;
1072
1073    my $seq = $seqs[$id];
1074    open(SEQ, "> $seq_dir/$id") || die "Can not write";
1075    #print SEQ "$dess[$id]\n$seq\n";
1076    print SEQ ">$id.$lens[$id]\n$seq\n";
1077    close(SEQ);
1078    $k++;
1079    last if ($k >= $total_jobs);
1080  }
1081
1082  if ($exec_mode eq "qsub") {
1083    for ($j=0; $j<$num_qsub; $j++) {
1084      my $t = "psi-cd-hit-$j";
1085      my $cmd = `qsub -N $t $remote_sh_script`;
1086      my $qsub_id = 0;
1087      if ($cmd =~ /(\d+)/) { $qsub_id = $1;} else {die "can not submit qsub job and return a id\n";}
1088      print LOG "qsub querying $j, PID $qsub_id\n";
1089      $qsub_ids{$qsub_id} = 1;
1090    }
1091  }
1092  elsif ($exec_mode eq "local") {
1093    #my $cmd = `sh $remote_sh_script >/dev/null 2>&1 &`;
1094    my $cmd = `sh $remote_sh_script`;
1095  }
1096
1097  return;
1098}
1099########## END run_batch_blast3
1100
1101
1102sub write_remote_sh_script {
1103  my ($i, $j, $k);
1104  my $local_sh = <<EOD;
1105#!/bin/sh
1106#PBS -v PATH
1107#\$ -v PATH
1108EOD
1109
1110  if ($sh_file) {
1111    $local_sh = `cat $sh_file`;
1112  }
1113
1114  open(RESH, "> $remote_sh_script") || die;
1115  print RESH <<EOD;
1116$local_sh
1117
1118cd $pwd
1119EOD
1120
1121  for ($k=0; $k<$para_no; $k++){
1122    print RESH "./$remote_perl_script $k&\n"
1123  }
1124  print RESH "wait\n\n";
1125
1126  close(RESH);
1127  return;
1128}
1129########## END write_remote_sh_script
1130
1131sub write_remote_perl_script {
1132  my $dir1 = ".";
1133  my $bl2  = "$blast_exe -d $dir1/$tmp_db $bl_para";
1134     $bl2  = "$blast_exe -db $dir1/$tmp_db $bl_para" if ($bl_plus);
1135
1136  my $opti = "-i"; $opti = "-query" if ($bl_plus);
1137  my $opto = "-o"; $opto = "-out"   if ($bl_plus);
1138
1139  open(REPERL, "> $remote_perl_script") || die;
1140  print REPERL <<EOD;
1141#!/usr/bin/perl
1142\$host = shift;
1143\$arg = shift;
1144
1145#### random sleep, rand() can be a fraction of second
1146select(undef,undef,undef,rand());
1147
1148if (\$arg) {
1149  \@ids = split(/,/, \$arg);
1150}
1151else {
1152  while(1) {
1153    if (opendir(DDIR, "$seq_dir")) {
1154      \@ids = grep {/^\\d+\$/} readdir(DDIR);
1155      last;
1156    }
1157    else {
1158      sleep(1);
1159    }
1160  }
1161}
1162
1163foreach \$id (\@ids) {
1164
1165  next unless (-e "$seq_dir/\$id");
1166  next if (-e "$seq_dir/\$id.lock");
1167  \$cmd = `touch $seq_dir/\$id.lock`;
1168
1169  if ($bl_STDIN) {
1170    \$cmd = `$bl2 $opti $seq_dir/\$id | $script_name -J parse_blout $bl_dir/\$id -c $NR_clstr -ce $NR_clstre -aS $opt_aS -aL $opt_aL -G $g_iden -prog $blast_prog -bs 1`;
1171  }
1172  else {
1173    \$cmd = `$bl2 $opti $seq_dir/\$id $opto $bl_dir/\$id`;
1174    \$cmd =                         `$script_name -J parse_blout $bl_dir/\$id -c $NR_clstr -ce $NR_clstre -aS $opt_aS -aL $opt_aL -G $g_iden -prog $blast_prog -bs 0`;
1175  }
1176  \$cmd = `rm -f  $seq_dir/\$id`;
1177  \$cmd = `rm -f  $seq_dir/\$id.lock`;
1178}
1179
1180(\$tu, \$ts, \$cu, \$cs) = times();
1181\$tt = \$tu + \$ts + \$cu + \$cs;
1182\$cmd = `echo \$tt >> $seq_dir/host.\$host.cpu`;
1183
1184EOD
1185  close(REPERL);
1186  my $cmd = `chmod 755 $remote_perl_script`;
1187
1188  return;
1189}
1190########## END write_remote_perl_script
1191
1192
1193sub wait_blast_out {
1194  my $out = shift;
1195  print LOG "waiting for $out";
1196  while(1) {
1197    if (-e $out) {
1198      my $last = `tail -1 $out`;
1199      chop($last);
1200      last if ($last =~ /^#$/);
1201    }
1202    sleep(1);
1203    print LOG ".";
1204  }
1205  print LOG "\n";
1206
1207  return;
1208}
1209########## END wait_blast_out
1210
1211
1212sub SGE_qstat_xml_query {
1213  my ($i, $j, $k, $cmd, $ll);
1214  %qstat_xml_data = (); #### global
1215  $cmd = `qstat -f -xml`;
1216  if ($cmd =~ /<queue_info/) { #### dummy
1217    $qstat_xml_data{"NULL"}= ["NULL","NULL"];
1218  }
1219  my $tmp = <<EOD;
1220<?xml version='1.0'?>
1221<job_info  xmlns:xsd="http://gridscheduler.svn.sourceforge.net/viewvc/gridscheduler/trunk/source/dist/util/resources/schemas/qstat/qstat.xsd?revision=11">
1222  <queue_info>
1223    <Queue-List>
1224      <name>all.q\@master</name>
1225      <qtype>BIP</qtype>
1226      <slots_used>0</slots_used>
1227      <slots_resv>0</slots_resv>
1228      <slots_total>0</slots_total>
1229      <load_avg>0.08000</load_avg>
1230      <arch>linux-x64</arch>
1231    </Queue-List>
1232...
1233    <Queue-List>
1234      <name>all.q\@node016</name>
1235      <qtype>BIP</qtype>
1236      <slots_used>32</slots_used>
1237      <slots_resv>0</slots_resv>
1238      <slots_total>32</slots_total>
1239      <load_avg>42.59000</load_avg>
1240      <arch>linux-x64</arch>
1241      <job_list state="running"> ####### running jobs in this section
1242        <JB_job_number>3535</JB_job_number>
1243        <JAT_prio>0.51468</JAT_prio>
1244        <JB_name>cd-hit</JB_name>
1245        <JB_owner>ubuntu</JB_owner>
1246        <state>r</state>
1247        <slots>4</slots>
1248      </job_list>
1249...
1250  </queue_info>
1251  <job_info>
1252    <job_list state="pending">  ######## pending jobs in this section
1253      <JB_job_number>3784</JB_job_number>
1254      <JAT_prio>0.60500</JAT_prio>
1255      <JB_name>cd-hit</JB_name>
1256      <JB_owner>ubuntu</JB_owner>
1257      <state>qw</state>
1258      <slots>32</slots>
1259    </job_list>
1260...
1261  </job_info>
1262</job_info>
1263
1264EOD
1265  my @lls = split(/\n/, $cmd);
1266  $i = 2; #### skip first 2 lines
1267  for (;     $i<$#lls+1; $i++) {
1268    if ($lls[$i] =~ /<job_list/) {
1269      my ($id, $name, $state);
1270      for (; $i<$#lls+1; $i++) {
1271        last if ($lls[$i] =~ /<\/job_list/);
1272        if ($lls[$i] =~ /<JB_job_number>(\d+)/) {  $id = $1;}
1273        if ($lls[$i] =~ /<JB_name>([^<]+)/) { $name = $1;}
1274        if ($lls[$i] =~ /<state>([^<]+)/) {$state = $1;}
1275      }
1276      if (defined($id) and defined($name) and defined($state)) {
1277        $qstat_xml_data{$id} = [$name, $state];
1278      }
1279    }
1280  }
1281}
1282
1283
12841;
1285
1286