1#!/usr/bin/perl -w 2################################################################################ 3######### PSI-cd-hit written by Weizhong Li at http://cd-hit.org 4################################################################################ 5our $pid = $$; 6our $db_in = ""; ################### 7our $db_out = ""; # input / output 8our $len_t = 10; ################### 9our $NR_clstr = 0.3; # 10our $NR_clstre = -1; #thresholds 11our $g_iden = 1; # 12our $opt_aS = 0.0; # 13our $opt_aL = 0.0; # 14our $circle = 0; # 15our $opt_g = 1; #################### 16our $blast_exe = "blastall -p blastp -m 8"; ######################### 17our $prof_exe = "blastpgp -m 8"; # 18our $prof_para = "-j 3 -F T -e 0.001 -b 500 -v 500"; # 19our $prof_db = ""; # 20our $bl_para = "-F T -e 0.000001 -b 100000 -v 100000"; # program 21our $bl_STDIN = 1; # 22our $keep_bl = 0; # 23our $blast_prog= "blastp"; # 24our $formatdb = "formatdb"; ######################### 25our $exec_mode = "local"; ####################### 26our $num_qsub = 1; # 27our $para_no = 1; # compute 28our $sh_file = ""; # 29our $batch_no_per_node = 50; ####################### 30our $reformat_seg = 50000; 31our $restart_seg = 20000; 32our $job = ""; 33our $job_file = ""; 34our $date = `date`; 35our $restart_in = ""; 36our $pwd = `pwd`; chop($pwd); 37our $db_clstr; 38our $db_log; 39our $db_out1; 40our $seq_dir; 41our $bl_dir; 42our $restart_file; 43our $tmp_db; 44our $remote_perl_script; 45our $remote_sh_script; 46our $bl_path; 47our $bl_plus = 1; #### use blast+ 48our $bl_threads = 1; 49our $skip_long = 0; 50our %qsub_ids = (); #### a list of qsub ids 51our %qstat_xml_data = (); 52 53 54sub parse_para_etc { 55 my ($arg, $cmd); 56 while($arg = shift) { 57 ## input/output: 58 if ($arg eq "-i") { $db_in = shift; } 59 elsif ($arg eq "-o") { $db_out = shift; } 60 elsif ($arg eq "-l") { $len_t = shift; } 61 ## thresholds 62 elsif ($arg eq "-c") { $NR_clstr = shift; } 63 elsif ($arg eq "-ce") { $NR_clstre = shift; } 64 elsif ($arg eq "-G") { $g_iden = shift; } 65 elsif ($arg eq "-aL") { $opt_aL = shift; } 66 elsif ($arg eq "-aS") { $opt_aS = shift; } 67 elsif ($arg eq "-g") { $opt_g = shift; } 68 elsif ($arg eq "-circle") { $circle = shift; } 69 elsif ($arg eq "-sl") { $skip_long = shift; } 70 ## program 71 elsif ($arg eq "-prog") { $blast_prog= shift; } 72 elsif ($arg eq "-p") { $prof_para = shift; } 73 elsif ($arg eq "-dprof") { $prof_db = shift; die "option -dprof no longer supported!";} 74 elsif ($arg eq "-s") { $bl_para = shift; } 75 elsif ($arg eq "-k") { $keep_bl = shift; } 76 elsif ($arg eq "-bs") { $bl_STDIN = shift; } 77 ## compute 78 elsif ($arg eq "-exec") { $exec_mode = shift; } 79 elsif ($arg eq "-host") { $num_qsub = shift; } 80 elsif ($arg eq "-para") { $para_no = shift; } 81 elsif ($arg eq "-shf") { $sh_file = shift; } 82 elsif ($arg eq "-blp") { $bl_threads = shift; } 83 elsif ($arg eq "-bat") { $batch_no_per_node = shift; } 84 ## job: 85 elsif ($arg eq "-rs") { $restart_seg = shift; } 86 elsif ($arg eq "-rf") { $reformat_seg= shift; } 87 elsif ($arg eq "-restart") { $restart_in= shift; } 88 elsif ($arg eq "-J") { $job = shift; $job_file = shift; } 89 ## blast path 90 elsif ($arg eq "-P") { $bl_path = shift; } 91 else { print_usage(); exit(); } 92 } 93 94 # speical jobs 95 if ($job eq "parse_blout") { job_parse_blout(); exit();} 96 97 if ($blast_prog eq "blastn") { 98 $formatdb = "formatdb -p F"; 99 $blast_exe = "blastall -p blastn -m 8"; 100 } 101 elsif ($blast_prog eq "megablast") { 102 $blast_prog = "blastn"; #### back to blastn for blast parser type 103 $formatdb = "formatdb -p F"; 104 $blast_exe = "megablast -H 100 -D 2 -m 8"; 105 } 106 elsif ($blast_prog eq "blastpgp") { 107 $blast_exe = "blastpgp -m 8 -j 3"; 108 } 109 110 #### for blast+ 111 if ($bl_plus) { 112 $formatdb = "makeblastdb -dbtype prot -max_file_sz 8GB"; 113 $blast_exe = "blastp -outfmt 6"; 114 $bl_para = "-seg yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program 115 116 if ($blast_prog eq "blastn") { 117 $formatdb = "makeblastdb -dbtype nucl -max_file_sz 8GB"; 118 $blast_exe = "blastp -task blastn -outfmt 6"; 119 $bl_para = "-dust yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program 120 } 121 elsif ($blast_prog eq "megablast") { 122 $blast_prog = "blastn"; #### back to blastn for blast parser type 123 $formatdb = "makeblastdb -dbtype nucl -max_file_sz 8GB"; 124 $blast_exe = "blastp -task megablast -outfmt 6"; 125 $bl_para = "-dust yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program 126 } 127 elsif ($blast_prog eq "blastpgp") { 128 $blast_exe = "psiblast -outfmt 6 -num_iterations 3 -num_threads $bl_threads"; 129 } 130 } 131 132 if ($bl_path) { 133 $blast_exe = "$bl_path/$blast_exe"; 134 $formatdb = "$bl_path/$formatdb"; 135 } 136 137 (-e $db_in) || die "No input"; 138 ($db_out) || die "No output"; 139 140 $db_clstr = "$db_out.clstr"; 141 $db_log = "$db_out.log"; 142 $db_out1 = "$db_out.out"; 143 $seq_dir = "$db_in-seq"; 144 $bl_dir = "$db_in-bl"; 145 $restart_file =" $db_out.restart"; 146 147 $tmp_db = "$db_in.$pid"; 148 $remote_perl_script = "$tmp_db-bl.pl"; 149 $remote_sh_script = "$tmp_db-bl.sh"; 150 151 $cmd = `mkdir $bl_dir $seq_dir`; 152 153 write_remote_perl_script(); 154 write_remote_sh_script(); 155 return; 156} 157########## END parse_para_etc 158 159 160sub read_db { 161 my $des = ""; 162 my $seq = ""; 163 my $ll; 164 165 open(DBIN, $db_in) || die "Can not open $db_in"; 166 while($ll=<DBIN>){ 167 chop($ll); 168 if ($ll =~ /^>/) { 169 $seq =~ s/\s//g; 170 if (length($seq) > $len_t) { add_seq($des, $seq); } 171 $des = $ll; $seq = ""; 172 173 } 174 else { $seq .= $ll; } 175 } 176 $seq =~ s/\s//g; 177 if (length($seq) > $len_t) { add_seq($des, $seq); } 178 close(DBIN); 179 180 ($NR_no >=1 ) || die "No sequence readin"; 181 182 print OUTT "Total seqs $NR_no in $db_in\n"; 183 return; 184} 185########## END read_db 186 187 188sub add_seq { 189 my ($des, $seq) = @_; 190 $des =~ s/\s.+$//; 191 push(@seqs, $seq); 192 push(@dess, $des); 193 push(@lens, length($seq)); 194 push(@idens, 0); 195 push(@passeds,0); 196 push(@NR_clstr_nos,0); 197 push(@in_bg, 0); 198 $NR_no++; 199 return; 200} 201########## END add_seq 202 203 204sub open_LOG { 205 open(OUTT, ">> $db_out1") || die "can not open $db_out1"; 206 select(OUTT); $|++; ### file handle flush 207 print OUTT "Started $date"; 208 209 open(LOG, ">> $db_log") || die "Can not open $db_log"; 210 select(LOG); $|++; ### file handle flush 211 select(STDOUT); 212 return; 213} 214########## END open_LOG 215 216sub write_LOG { 217 my $txt=shift; 218 print LOG "$txt\n"; 219} 220 221{## use static variables 222my $last_NR90_no=0; 223my $last_NR_passed=0; 224sub watch_progress { 225 my ($i0, $NR90_no, $NR_passed, $NR_no, $flag) = @_; 226 my $i1 = $i0+1; 227 228 if ( $i1 % 10 == 0 ) { 229 print OUTT "."; 230 $flag = 1 if ( $i1 % 100 == 0 ); 231 } 232 233 if ($flag) { 234 my $t1 = (int($NR_passed/$NR_no*10000)) / 100; 235 my $t90 = $NR90_no - $last_NR90_no; 236 my $tno = $NR_passed - $last_NR_passed; 237 my ($tu, $ts, $cu, $cs) = times(); 238 my $tt = $tu + $ts + $cu + $cs; 239 print OUTT 240 "$i1 finished $NR90_no clusters $NR_passed passed $t90/$tno clstr/passed $t1% done $tt cpu\n"; 241 $last_NR90_no = $NR90_no; 242 $last_NR_passed = $NR_passed; 243 } 244 return; 245} 246} 247 248 249sub close_LOG { 250 my $date = `date`; print OUTT "Completed $date\n"; 251 my $total_cpu = total_remote_cpu(); 252 print OUTT "Total CPUs on remote hosts: $total_cpu\n"; 253 close(OUTT); 254 close(LOG); 255 return; 256} 257########## END close_LOG 258 259###### need to change to read dir because 260sub total_remote_cpu { 261 my ($i, $j, $k, $ll); 262 my $tt = 0; 263 for ($j=0; $j<$num_qsub; $j++) { 264 open(TCPU, "$seq_dir/host.$j.cpu") || next; 265 while($ll = <TCPU>) { 266 chop($ll); 267 $tt += $ll; 268 } 269 close(TCPU); 270 } 271 return $tt; 272} 273########## END total_remote_cpu 274 275 276sub job_parse_blout { 277 my ($i, $j, $k); 278 my @hits = process_blout_blastp_blastn($job_file); 279 280 open(BLOUT2, "> $job_file.out") || return; 281 foreach $i (@hits) { 282 print BLOUT2 join("\t", @{$i}), "\n"; 283 } 284 print BLOUT2 "#\n"; 285 close(BLOUT2); 286 return; 287} 288########## END job_parse_blout 289 290 291sub write_restart { 292 my ($i0, $i, $j, $k); 293 open(RES, "> $restart_file") || die; 294 295 for ($i0=0; $i0<$NR_no; $i0++) { 296 $i = $NR_idx[$i0]; 297 print RES "$i\t$NR_clstr_nos[$i]\t$idens[$i]\t$passeds[$i]\n"; 298 } 299 300 close(RES); 301 return; 302} 303########## END write_restart 304 305 306sub read_restart { 307 my ($ii, $i0, $i, $j, $k, $ll); 308 my @lls; 309 open(RESIN, $restart_in) || die; 310 311 $NR_passed = 0; 312 $NR90_no = 0; 313 $ii = -1; 314 $i0 = 0; 315 while($ll = <RESIN>) { 316 chop($ll); 317 @lls = split(/\t/,$ll); 318 $i = $lls[0]; 319 $NR_clstr_nos[$i] = $lls[1]; 320 $idens[$i] = $lls[2]; 321 $passeds[$i] = $lls[3]; 322 $NR_passed++ if ($lls[3]); 323 324 if ($lls[2] eq "*") { #rep 325 $NR90_no++; 326 $ii = $i0 if ($lls[3]); 327 } 328 $NR_idx[$i0] = $i; 329 $i0++; # idx of sorted , see write_restart 330 } 331 close(RESIN); 332 333 $ii++; # $ii to be last rep processed 334 return $ii; 335} 336########## END read_restart 337 338 339sub write_db_clstr { 340 my ($i0, $i, $j, $k); 341 342 my @NR90_seq = (); 343 for ($i=0; $i<$NR90_no; $i++) { $NR90_seq[$i] = []; } 344 for ($i0=0; $i0<$NR_no; $i0++) { 345 $i = $NR_idx[$i0]; 346 next unless ($passeds[$i]); 347 $j = $NR_clstr_nos[$i]; 348 next unless ($j < $NR90_no); 349 push(@{$NR90_seq[$j]}, $i); 350 } 351 352 open(DBCLS, "> $db_clstr") || die "Can not write $db_clstr"; 353 for ($i=0; $i<$NR90_no; $i++) { 354 print DBCLS ">Cluster $i\n"; 355 $k = 0; 356 foreach $j (@{ $NR90_seq[$i] }) { 357 my $des = (split(/\s+/,$dess[$j]))[0]; 358 print DBCLS "$k\t$lens[$j]"."aa, $des... "; 359 if ($idens[$j] eq "*") { print DBCLS "*\n"; } 360 else { print DBCLS "at $idens[$j]\n";} 361 $k++; 362 } 363 } 364 close(DBCLS); 365 366 @NR90_seq=(); 367 return; 368} 369########## END write_db_clstr 370 371 372sub remove_raw_blout { 373 my $NR_sofar = shift; 374 my ($i0, $i, $j, $k, $cmd); 375 return if ($keep_bl); 376 377 for ($i0=$NR_sofar; $i0>=0; $i0--) { 378 $i = $NR_idx[$i0]; 379 next unless $passeds[$i]; 380 next unless ($idens[$i] eq "*"); #only reps have blout 381 my $fout = "$bl_dir/$i"; 382 last unless (-e "$fout.out"); #removed from last call 383 if (not $bl_STDIN) { $cmd = `rm -f $fout`; } 384 $cmd = `rm -f $bl_dir/$i.out`; 385 } 386 return; 387} 388########## END remove_raw_blout 389 390 391sub remove_raw_blout_bg { 392 my $NR_sofar = shift; 393 my ($i0, $i, $j, $k, $cmd); 394 return if ($keep_bl); 395 396 my $tmp_sh_script = "$tmp_db-rm-$NR_sofar.sh"; 397 open(OUTRM, ">$tmp_sh_script") || die "can not write to $tmp_sh_script"; 398 399 for ($i0=$NR_sofar; $i0>=0; $i0--) { 400 $i = $NR_idx[$i0]; 401 next unless $passeds[$i]; 402 next unless ($idens[$i] eq "*"); #only reps have blout 403 my $fout = "$bl_dir/$i"; 404 last unless (-e "$fout.out"); #removed from last call 405 if (not $bl_STDIN) { print OUTRM "rm -f $fout\n"; } 406 print OUTRM "rm -f $bl_dir/$i.out"; 407 } 408 print OUTRM "rm -f $tmp_sh_script\n"; ## remove self 409 close(OUTRM); 410 sleep(3); 411 412 $cmd = `sh $tmp_sh_script >/dev/null 2>&1 &`; 413 return; 414} 415########## END remove_raw_blout_bg 416 417 418sub fish_other_homolog { 419 my ($i, $j, $k, $i0, $j0, $k0); 420 $id = shift; # real idx, not sorted idx 421 my @hits = (); 422 423 wait_blast_out("$bl_dir/$id.out"); 424 open(BLPOUT, "$bl_dir/$id.out") || return; 425 while($i=<BLPOUT>) { 426 last if ($i =~ /^#/); 427 chop($i); 428 push(@hits, [split(/\t/,$i)]); 429 } 430 close(BLPOUT); 431 my $rep_len = $lens[$id]; 432 433 foreach $i (@hits) { 434 my $id1 = $i->[0]; 435 next unless ($id1 < $NR_no); 436 next if ($idens[$id1] eq "*"); #existing reps 437 next if ($lens[$id1] > $rep_len); # in opt_g=1 mode, preventing it from being clustered into short rep 438 439 if ( $passeds[$id1] ) { #### if this hit is better -g 1 mode 440 my $old_e = (split(/\//,$idens[$id1]))[0]; 441 if ($i->[3] < $old_e) { 442 $idens[$id1] = "$i->[3]/$i->[2]aa/$i->[1]%"; 443 $passeds[$id1] = 1; 444 $NR_clstr_nos[$id1] = $NR90_no; 445 } 446 next; 447 } 448 449 $idens[$id1] = "$i->[3]/$i->[2]aa/$i->[1]%"; 450 $passeds[$id1] = 1; 451 $NR_clstr_nos[$id1] = $NR90_no; 452 $NR_passed++; 453 } 454 return; 455} 456########## END fish_other_homolog 457 458 459########### if a hit has multiple HSPs on both + - strands 460########### keep only the HSPs, whose strand is same as the top HSP 461sub keep_strand_with_top_hsp { 462 my $self = shift; 463 my ($i,$j,$k); 464 465 my %id_2_strand = (); 466 my @new_sbj = (); 467 my $new_no = 0; 468 for ($i=0; $i<$self->{no}; $i++) { 469 my $p = $self->{sbj}->[$i]; 470 my ($id1, $len_sub) = split(/\./, $p->{id}); 471 if (not defined($id_2_strand{$id1})) { 472 $id_2_strand{$id1} = $p->{frame}; 473 } 474 if ($p->{frame} eq $id_2_strand{$id1}) { #### this stand is same as the top strand 475 push(@new_sbj, $self->{sbj}->[$i]); 476 $new_no++; 477 } 478 } 479 $self->{no} = $new_no; 480 $self->{sbj} = [@new_sbj]; 481} 482########## END keep_strand_with_top_hsp 483 484########## for blastpgp -j no (no>1) 485########## keep hits from the last round 486sub keep_hsp_of_last_round { 487 my $self = shift; 488 my ($i,$j,$k); 489 490 my @new_sbj = (); 491 my $new_no = 0; 492 my $last_score = 9999999*9999999*9999999; # a big one 493 for ($i=0; $i<$self->{no}; $i++) { 494 my $p = $self->{sbj}->[$i]; 495 my $score = $p->{score}; 496 497 if ($score > $last_score) { ## this is new round of hits 498 @new_sbj = (); 499 $new_no = 0; 500 } 501 $last_score = $score; 502 push(@new_sbj, $self->{sbj}->[$i]); 503 $new_no++; 504 } 505 $self->{no} = $new_no; 506 $self->{sbj} = [@new_sbj]; 507} 508########## END keep_hsp_of_last_round 509 510########## if a query hit a subject with multiple HSPs 511########## only the top HSP is kept 512sub keep_top_hsp { 513 my $self = shift; 514 my ($i,$j,$k); 515 516 my %id_exist = (); 517 my @new_sbj = (); 518 my $new_no = 0; 519 for ($i=0; $i<$self->{no}; $i++) { 520 my $p = $self->{sbj}->[$i]; 521 my ($id1, $len_sub) = split(/\./, $p->{id}); 522 next unless ($len_sub >0) ; 523 524 if (not defined($id_exist{$id1})) { 525 $id_exist{$id1} = 1; 526 push(@new_sbj, $self->{sbj}->[$i]); 527 $new_no++; 528 } 529 } 530 $self->{no} = $new_no; 531 $self->{sbj} = [@new_sbj]; 532} 533########## keep_top_hsp 534 535########## let the top hsp to start at 0 for both query and subject 536########## i.e. the begining of HSP to be new original - coordinate 0 537########## then reset all other HSPs' alignment coordinates 538sub reset_alignment_coor_for_circle_seq { 539 my $self = shift; 540 my ($i,$j,$k); 541 542 my $last_id = ""; 543 $j = 0; 544 my $hsp_count = 0; # number of HSPs for a subject 545 for ($i=0; $i<$self->{no}; $i++) { 546 my $p = $self->{sbj}->[$i]; 547 my ($id1, $len_sub) = split(/\./, $p->{id}); 548 549 if ($id1 ne $last_id) { 550 if ($hsp_count > 1) { # it is necessary to reset coordinate when at least 2 HSP 551 my $p_top_hsp = $self->{sbj}->[$j]; 552 my $len_q = (split(/\./, $p_top_hsp->{qid}))[1]; 553 my $len_s = (split(/\./, $p_top_hsp->{id}))[1]; 554 my $ref_q = ($p_top_hsp->{qfrom} < $p_top_hsp->{qend}) ? $p_top_hsp->{qfrom} : $p_top_hsp->{qend}; 555 my $ref_s = ($p_top_hsp->{sfrom} < $p_top_hsp->{send}) ? $p_top_hsp->{sfrom} : $p_top_hsp->{send}; 556 for ($k = $j; $k<$j+$hsp_count; $k++) { 557 $self->{sbj}->[$k]->{qfrom} -= $ref_q; if ($self->{sbj}->[$k]->{qfrom} < 0) {$self->{sbj}->[$k]->{qfrom} += $len_q;} 558 $self->{sbj}->[$k]->{qend} -= $ref_q; if ($self->{sbj}->[$k]->{qend} < 0) {$self->{sbj}->[$k]->{qend} += $len_q;} 559 $self->{sbj}->[$k]->{sfrom} -= $ref_s; if ($self->{sbj}->[$k]->{sfrom} < 0) {$self->{sbj}->[$k]->{sfrom} += $len_s;} 560 $self->{sbj}->[$k]->{send} -= $ref_s; if ($self->{sbj}->[$k]->{send} < 0) {$self->{sbj}->[$k]->{send} += $len_s;} 561 } 562 } 563 $j = $i; 564 $hsp_count = 0; 565 } 566 $last_id = $id1; 567 $hsp_count++; 568 } 569 570 #last subject 571 if ($hsp_count > 1) { # it is necessary to reset coordinate when at least 2 HSP 572 my $p_top_hsp = $self->{sbj}->[$j]; 573 my $len_q = (split(/\./, $p_top_hsp->{qid}))[1]; 574 my $len_s = (split(/\./, $p_top_hsp->{id}))[1]; 575 my $ref_q = ($p_top_hsp->{qfrom} < $p_top_hsp->{qend}) ? $p_top_hsp->{qfrom} : $p_top_hsp->{qend}; 576 my $ref_s = ($p_top_hsp->{sfrom} < $p_top_hsp->{send}) ? $p_top_hsp->{sfrom} : $p_top_hsp->{send}; 577 for ($k = $j; $k<$j+$hsp_count; $k++) { 578 $self->{sbj}->[$k]->{qfrom} -= $ref_q; if ($self->{sbj}->[$k]->{qfrom} < 0) {$self->{sbj}->[$k]->{qfrom} += $len_q;} 579 $self->{sbj}->[$k]->{qend} -= $ref_q; if ($self->{sbj}->[$k]->{qend} < 0) {$self->{sbj}->[$k]->{qend} += $len_q;} 580 $self->{sbj}->[$k]->{sfrom} -= $ref_s; if ($self->{sbj}->[$k]->{sfrom} < 0) {$self->{sbj}->[$k]->{sfrom} += $len_s;} 581 $self->{sbj}->[$k]->{send} -= $ref_s; if ($self->{sbj}->[$k]->{send} < 0) {$self->{sbj}->[$k]->{send} += $len_s;} 582 } 583 } 584 585 return; 586} 587########## reset_alignment_coor_for_circle_seq 588 589 590sub process_blout_blastp_blastn { 591 my ($i, $j, $k, $i0, $j0, $k0); 592 my $blout = shift; 593 my @blhits = (); 594 595 #### need $len_rep 596 my $len_rep = 0; 597 my $bl = readblast_m8("", $blout); 598 if ($blast_prog eq "blastn") { keep_strand_with_top_hsp($bl); } 599 if (($blast_prog eq "blastpgp") and (not $prof_db)) {keep_hsp_of_last_round($bl); } 600 601 if ($g_iden == 0 ) { #### Local identity 602 keep_top_hsp($bl); #### local alignment, only the top HSP 603 604 for ($i=0; $i<$bl->{no}; $i++) { 605 my $p = $bl->{sbj}->[$i]; 606 my ($id1, $len_sub) = split(/\./, $p->{id}); 607 my $frame = $p->{frame}; 608 if (not $len_rep) {$len_rep = (split(/\./,$p->{qid}))[1]; } 609 my $iden = $p->{iden}; 610 next unless (($len_sub >0) and ($len_rep>0)); 611 my $cov_aS = $p->{alnln} / $len_sub; 612 my $cov_aL = $p->{alnln} / $len_rep; 613 my $exp1 = $p->{expect}; 614 615 if (($iden/100 > $NR_clstr or $exp1<$NR_clstre) and ($cov_aS >= $opt_aS) and ($cov_aL >= $opt_aL) ) { 616 push(@blhits, [$id1, $iden, $p->{alnln}, $exp1, $frame]); 617 } 618 } 619 return @blhits; 620 } #### END if ($g_iden == 0 ) 621 else { #### Global idnetity 622 if (($blast_prog eq "blastn") and $circle) { reset_alignment_coor_for_circle_seq($bl); } 623 #### get colinear non-overlapping HSPs 624 my @hsp = (); #### [id, len, qfrom, qend, sbegin, send, expect] 625 my $iden_letters = 0; 626 my $aln_letters = 0; 627 my @aln_lens = (); 628 my $hsp_no = 0; 629 for ($i=0; $i<$bl->{no}; $i++) { 630 my $p = $bl->{sbj}->[$i]; 631 my ($id1, $len_sub) = split(/\./, $p->{id}); 632 my $frame = $p->{frame}; 633 if (not $len_rep) {$len_rep = (split(/\./,$p->{qid}))[1]; } 634 next unless (($len_sub >0) and ($len_rep>0)); 635 636 if ($hsp_no) { 637 if ($id1 ne $hsp[0]->[0]) { 638 #### 1. parse previous subject's HSPs 639 my $iden = int($iden_letters / $hsp[0]->[1] * 10000)/100; 640 my $cov_aS = $aln_letters / $hsp[0]->[1]; 641 my $cov_aL = $aln_letters / $len_rep; 642 my $exp1 = $hsp[0]->[6]; 643 my $frame = $hsp[0]->[7]; 644 645 if (($iden/100 > $NR_clstr or $exp1<$NR_clstre) and ($cov_aS >= $opt_aS) and ($cov_aL >= $opt_aL) ) { 646 #push(@blhits, [$hsp[0]->[0], $iden, $aln_letters, $exp1, $frame]); 647 push(@blhits, [$hsp[0]->[0], $iden, join(":", @aln_lens), $exp1, $frame]); 648 } 649 #### 2. init some values 650 @hsp = (); 651 $iden_letters = 0; 652 $aln_letters = 0; 653 @aln_lens = (); 654 $hsp_no = 0; 655 } 656 } 657 658 #check whether overlap with previous high score HSPs 659 my $overlap_flag = 0; 660 for ($j=0; $j<$hsp_no; $j++) { 661 if (overlap1($p->{qfrom}, $p->{qend}, $hsp[$j]->[2], $hsp[$j]->[3])) { $overlap_flag = 1; last; } 662 if (overlap1($p->{sfrom}, $p->{send}, $hsp[$j]->[4], $hsp[$j]->[5])) { $overlap_flag = 1; last; } 663 } 664 next if ($overlap_flag); 665 666 #check whether this HSP cross with previous high score HSPs 667 my $cross_flag = 0; 668 for ($j=0; $j<$hsp_no; $j++) { 669 if (cross1($p->{qfrom}, $p->{qend}, $hsp[$j]->[2], $hsp[$j]->[3], 670 $p->{sfrom}, $p->{send}, $hsp[$j]->[4], $hsp[$j]->[5])) { 671 $cross_flag = 1; last; 672 } 673 } 674 next if ($cross_flag); 675 676 push(@hsp, [$id1, $len_sub, $p->{qfrom}, $p->{qend}, $p->{sfrom}, $p->{send}, $p->{expect}, $p->{frame}]); 677 $iden_letters += int($p->{iden} * $p->{alnln} / 100); 678 $aln_letters += $p->{alnln}; 679 push(@aln_lens, $p->{alnln}); 680 $hsp_no++; 681 } 682 683 if ($hsp_no) { #last record 684 #### 1. parse previous subject's HSPs 685 my $iden = int($iden_letters / $hsp[0]->[1] * 10000)/100; 686 my $cov_aS = $aln_letters / $hsp[0]->[1]; 687 my $cov_aL = $aln_letters / $len_rep; 688 my $exp1 = $hsp[0]->[6]; 689 my $frame = $hsp[0]->[7]; 690 691 if (($iden/100 > $NR_clstr or $exp1<$NR_clstre) and ($cov_aS >= $opt_aS) and ($cov_aL >= $opt_aL) ) { 692 #push(@blhits, [$hsp[0]->[0], $iden, $aln_letters, $exp1, $frame]); 693 push(@blhits, [$hsp[0]->[0], $iden, join(":", @aln_lens), $exp1, $frame]); 694 } 695 } 696 697 return @blhits; 698 } 699} 700########## END process_blout_blastp_blastn 701 702 703sub overlap1 { 704 my ($b1, $e1, $b2, $e2) = @_; 705 706 my $t; ### 707 if ($e1 < $b1) { $t = $e1; $e1 = $b1; $b1 = $t; } 708 if ($e2 < $b2) { $t = $e2; $e2 = $b2; $b2 = $t; } 709 710 return 0 if ($e2 < $b1); 711 return 0 if ($b2 > $e1); 712 return ( ($e1<$e2)? $e1:$e2 )-( ($b1>$b2)? $b1:$b2); 713} 714########## END overlap1 715 716## modified on 2013_0818 to hancle +- frames 717sub cross1 { 718 my ($q_b1, $q_e1, $q_b2, $q_e2, 719 $s_b1, $s_e1, $s_b2, $s_e2) = @_; 720 721 my $fr_q1 = ($q_b1 < $q_e1) ? 1 : -1; 722 my $fr_q2 = ($q_b2 < $q_e2) ? 1 : -1; 723 my $fr_s1 = ($s_b1 < $s_e1) ? 1 : -1; 724 my $fr_s2 = ($s_b2 < $s_e2) ? 1 : -1; 725 726 my $fr1 = $fr_q1 * $fr_s1; 727 my $fr2 = $fr_q2 * $fr_s2; 728 return 1 if (($fr1 * $fr2) < 0); # one ++ and one +- 729 730 my $t; 731 if ($q_e1 < $q_b1) { $t = $q_e1; $q_e1 = $q_b1; $q_b1 = $t; } 732 if ($q_e2 < $q_b2) { $t = $q_e2; $q_e2 = $q_b2; $q_b2 = $t; } 733 if ($s_e1 < $s_b1) { $t = $s_e1; $s_e1 = $s_b1; $s_b1 = $t; } 734 if ($s_e2 < $s_b2) { $t = $s_e2; $s_e2 = $s_b2; $s_b2 = $t; } 735 736# after above transformation 737# 0 q_b1 q_e1 q_b2 q_e2 qlen 738# query 5' ==================================================================== 739# match |||||||||||||||| ||||||||||||| 740# subject 5' ========================================================================>>>>>> frame + 741# 0 s_b1 s_e1 s_b2 s_e2 slen 742 743# match |||||||||||||||| ||||||||||||| 744# subject 3' ========================================================================>>>>>> frame - 745# slen s_e1 s_b1 s_e2 s_b2 0 746 747 if (($fr1 > 0) and ($fr2>0)) { # both ++ 748 return ( (($q_b2-$q_b1)*($s_b2-$s_b1) <0) ? 1 : 0); 749 } 750 else { # both -- 751 return ( (($q_b2-$q_b1)*($s_e1-$s_e2) <0) ? 1 : 0); 752 } 753 754} 755########## END cross1 756 757 758## modified on 2013_0818 to hancle +- frames 759sub cross1_before_2013_0818 { 760 my ($q_b1, $q_e1, $q_b2, $q_e2, 761 $s_b1, $s_e1, $s_b2, $s_e2) = @_; 762 763 my $t; 764 if ($q_e1 < $q_b1) { $t = $q_e1; $q_e1 = $q_b1; $q_b1 = $t; } 765 if ($q_e2 < $q_b2) { $t = $q_e2; $q_e2 = $q_b2; $q_b2 = $t; } 766 if ($s_e1 < $s_b1) { $t = $s_e1; $s_e1 = $s_b1; $s_b1 = $t; } 767 if ($s_e2 < $s_b2) { $t = $s_e2; $s_e2 = $s_b2; $s_b2 = $t; } 768 769 return ( (($q_b2-$q_b1)*($s_b2-$s_b1) <0) ? 1 : 0); 770} 771########## END cross1 772 773sub readblast_m8 { 774 my ($i, $j, $k, $ll, $no); 775 my ($q_seq, $filename) = @_; 776 777 778 my $fh = "BL" ; 779 if ($bl_STDIN) { $fh = "STDIN"; } 780 else { open($fh, $filename) || return; } 781 782 my @this_sbj = (); 783 $no = 0; 784 while($ll = <$fh>) { 785 chop($ll); 786 my @lls = split(/\t/,$ll); 787 my $frame = ""; 788 $frame .= ($lls[6] < $lls[7]) ? "+" : "-"; 789 $frame .= ($lls[8] < $lls[9]) ? "+" : "-"; 790 next unless ($lls[0] and $lls[1]); 791 $this_sbj[$no] = { 792 'qid' => $lls[0], 793 'id' => $lls[1], 794 'iden' => $lls[2], 795 'alnln' => $lls[3], 796 'ms' => $lls[4], 797 'gap' => $lls[5], 798 'qfrom' => $lls[6], 799 'qend' => $lls[7], 800 'sfrom' => $lls[8], 801 'send' => $lls[9], 802 'expect' => $lls[10], 803 'score' => $lls[11], 804 'frame' => $frame, 805 }; 806 807 $no++; 808# BLASTP 2.2.24 [Aug-08-2010] 809# Query: gi|388328107|pdb|4DDG|A Chain A, Crystal Structure Of Human Otub1UBCH5B~UBUB 810# Database: pdbaa.fa 811# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score 812#gi|388328107|pdb|4DDG|A gi|388328107|pdb|4DDG|A 91.81 171 9 3 6 171 1 171 6e-89 323 813#gi|388328107|pdb|4DDG|A gi|388328107|pdb|4DDG|A 96.51 86 3 0 235 320 155 240 2e-41 166 814 } 815 close($fh) if (not $bl_STDIN); 816 817 my $self = { 818 'no' => $no, 819 'sbj' => [@this_sbj], 820 }; 821 return $self; 822} 823########## END readblast_m8 824 825 826sub blast_formatdb { 827 my ($i0, $i, $j, $k, $len1); 828 829 open(FDB, "> $tmp_db") || die; 830 $j = 0; 831 $len1 = 0; 832 for ($i0=$NR_no-1; $i0>=0; $i0--) { ### from shortest to longest 833 $i = $NR_idx[$i0]; 834 last if ($idens[$i] eq "*"); ### last if reach rep 835 next if ($lens[$i] < $opt_aL_lower_band); 836 next if ($passeds[$i] and ($opt_g==0)); 837 my $seq = $seqs[$i]; 838 $seq =~ s/(.{70})/$1\n/g; 839 $seq =~ s/\n$//; 840 #print FDB ">$i $dess[$i]\n$seq\n"; 841 print FDB ">$i.$lens[$i]\n$seq\n"; 842 $j++; 843 $len1 += $lens[$i]; 844 } 845 close(FDB); 846 847 while(1) { 848 opendir(SEQDB, $seq_dir) || next; 849 my @leftseqs = grep {/lock/} readdir(SEQDB); 850 closedir(SEQDB); 851 852 last unless @leftseqs; 853 sleep(3); 854 } 855 856 return(0, 0) unless ($j > 0); 857 858 my $cmd_line = "$formatdb -i $tmp_db"; 859 $cmd_line = "$formatdb -in $tmp_db" if ($bl_plus); 860 my $cmd = `$cmd_line`; 861 862 ((-e "$tmp_db.phr") and (-e "$tmp_db.pin") and (-e "$tmp_db.psq")) || 863 ((-e "$tmp_db.nhr") and (-e "$tmp_db.nin") and (-e "$tmp_db.nsq")) || 864 ((-e "$tmp_db.00.phr") and (-e "$tmp_db.00.pin") and (-e "$tmp_db.00.psq")) || 865 ((-e "$tmp_db.00.nhr") and (-e "$tmp_db.00.nin") and (-e "$tmp_db.00.nsq")) 866 || die "Can not formatdb"; 867 868 return($j, $len1); 869} 870########## END blast_formatdb 871 872 873sub remove_blast_db { 874 my ($i, $j, $k); 875 $cmd = `rm -f $tmp_db`; 876 $cmd = `rm -f $tmp_db.p*`; 877 $cmd = `rm -f $tmp_db.n*`; 878 879 return; 880} 881########## END remove_blast_db 882 883 884my $common_usage = <<EOD; 885 886Options 887 input/output: 888 -i in_dbname, required 889 -o out_dbname, required 890 -l length_of_throw_away_sequences, default 10 891 892 thresholds: 893 -c clustering threshold (sequence identity), default 0.3 894 -ce clustering threshold (blast expect), default -1, 895 it means by default it doesn't use expect threshold, 896 but with positive value, the program cluster seqs if similarities 897 meet either identity threshold or expect threshold 898 -G (1/0) use global identity? default 1 899 two sequences Long (i.e. representative) and Short (redunant) may have multiple 900 alignment fragments (i.e. HSPs), see: 901 seq1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx Long sequence 902 |||||||||||||||||| ///////////// i.e. representative 903 |||||||||||||||||| ///////////// sequence 904 ||||||||HSP 1 |||| ////HSP 2 /// 905 |||||||||||||||||| ///////////// 906 |||||||||||||||||| ///////////// 907 seq2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx Short sequence 908 << length 1 >> << len 2 >> i.e. redundant 909 <<<<<<<<<<<< length of short sequence >>>>>>>>>>>>>> sequence 910 911 total identical letters from all co-linear and non-overlapping HSPs 912 Glogal identity = ------------------------------------------------------------------- 913 length of short sequence 914 Local identity = identity of the top high score HSP 915 if you prefer to use -G 0, it is suggested that you also 916 use -aS, -aL, such as -aS 0.8, to prevent very short matches. 917 -aL alignment coverage for the longer sequence, default 0.0 918 if set to 0.9, the alignment must covers 90% of the sequence 919 -aS alignment coverage for the shorter sequence, default 0.0 920 if set to 0.9, the alignment must covers 90% of the sequence 921 -g (1/0), default 0 922 by cd-hit's default algorithm, a sequence is clustered to the first 923 cluster that meet the threshold (fast cluster). If set to 1, the program 924 will cluster it into the most similar cluster that meet the threshold 925 (accurate but slow mode) 926 but either 1 or 0 won't change the representatives of final clusters 927 -circle (1/0), default 0 928 when set to 1, treat sequences as circular sequence. 929 bacterial genomes, plasmids are circular, but their genome coordinate maybe arbitary, 930 the 2 HSPs below will be treated as non co-linear with -circle 0 931 the 2 HSPs below will be treated as co-linear with -circle 1 932 -------------circle----------- 933 | | 934 seq1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx genome / plasmid 1 935 \\\\\\\\ ///////////// 936 \\\\\\\\ ///////////// 937 HSP 2 -> ////HSP 1 /// <-HSP 2 938 ///////////// \\\\\\\\ 939 ///////////// \\\\\\\\ 940 seq2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx genome / plasmid 2 941 | | 942 -----------circle-------------- 943 -sl, length of very long sequences to be skipped, default 0, no skipping 944 e.g. -sl 5000 means sequences longer than 5000 aa will be treated as singleton clusters 945 without clustering, to save time, especially when there is -aL option in place, very 946 long sequences will not be clustered anyway. 947 program: 948 -prog (blastp, blastn, megablast, blastpgp), default blastp 949 -p profile search para, default 950 "-j 3 -F F -e 0.001 -b 500 -v 500" 951 -dprof database for building PSSM, default using input 952 you can also use another database that is more comprehensive like NR80 953 -s blast search para, default 954 "-F F -e 0.000001 -b 100000 -v 100000" 955 -bs (1/0) default 1 956 pipe blast results from into parser instead of save in hard drive (save time) 957 958 compute: 959 -exec (qsub, local) default local 960 this program writes a shell script to run blast, this script is 961 either performed locally by sh or remotely by qsub 962 with qsub, you can use PBS, SGE etc 963 -host number of hosts, ie number of qsub jobs 964 -para number of parallel blast job per qsub job (each blast can use multi cores), default 1 965 -blp number of threads per blast job, default 1 966 number of threads per blast job X number of parallel blast job per qsub job 967 should <= the number of cores in your computer 968 if your computer grid has 32 cores / node, do either of the followings 969 -para 4 -blp 8 970 -para 8 -blp 4 971 -para 16 -blp 2 972 -para 32 -blp 1 973 -bat number of sequences a blast job to process 974 -shf a filename for add local settings into the job shell script 975 for example, when you run PBS jobs, you can add quene name etc in this 976 file and this script will add them into the job shell script 977e.g. template file for PBS 978#!/bin/sh 979#PBS -v PATH 980#PBS -l walltime=8:00:00 981#PBS -q job_queue.q 982 983e.g. template file for SGE or OGE 984#!/bin/sh 985#\$ -v PATH 986#\$ -q job_queue.q 987#\$ -V 988#\$ -pe orte 8 989 990 job: 991 -rs steps of save restart file and clustering output, default 5000 992 everytime after process 5000 sequences, program write a 993 restart file and current clustering information 994 -restart restart file, readin a restart file 995 if program crash, stoped, termitated, you can restart it by 996 add a option "-restart sth.restart" 997 -rf steps of re format blast database, default 200,000 998 if program clustered 200,000 seqs, it remove them from seq 999 pool, and re format blast db to save time 1000 -J job, job_file, exe specific jobs like parse blast outonly 1001 DO NOT use it, it is only used by this program itself 1002 -k (1/0) keep blast raw output file, default $keep_bl 1003 1004 -P path to executables 1005EOD 1006 1007 1008sub print_usage { 1009 print <<EOD; 1010Usage psi-cd-hit [Options] 1011$common_usage 1012 1013 ============================== 1014 by Weizhong Li, liwz\@sdsc.edu 1015 ============================== 1016 If you find cd-hit useful, please kindly cite: 1017 1018 "Clustering of highly homologous sequences to reduce thesize of large protein database", Weizhong Li, Lukasz Jaroszewski & Adam GodzikBioinformatics, (2001) 17:282-283 1019 "Cd-hit: a fast program for clustering and comparing large sets of protein or nucleotide sequences", Weizhong Li & Adam Godzik Bioinformatics, (2006) 22:1658-1659 1020 1021EOD 1022 return; 1023} 1024########## END print_usage 1025 1026 1027## like above, but don't assign seqs to specific node 1028## while let nodes run them autoly 1029sub run_batch_blast3 { 1030 my $i0 = shift; 1031 my ($id, $i, $j, $k, $cmd); 1032 1033 #### wait before qsubs 1034 if ($exec_mode eq "qsub") { 1035 while(1) { 1036 SGE_qstat_xml_query(); 1037 last unless (%qsub_ids); 1038 1039 my $wait_flag = 0; 1040 foreach my $qsub_id (keys %qsub_ids) { 1041 if (defined($qstat_xml_data{$qsub_id})) { #### still running 1042 $wait_flag = 1; 1043 $cmd = `qdel -f $qsub_id`; #### at this point, all running jobs are not necessary, 1044 print LOG "force delete un necessary job $qsub_id\n"; 1045 } 1046 else { 1047 delete $qsub_ids{$qsub_id}; 1048 } 1049 } 1050 1051 if ($wait_flag) {print LOG "wait submitted jobs\n"; sleep(1); } 1052 } 1053 1054 #### delete seq files from last batch 1055 opendir(DIR1, $seq_dir); 1056 my @files = grep { /^\d/ } readdir(DIR1); 1057 closedir(DIR1); 1058 foreach $i (@files) { 1059 $cmd = `rm -f $seq_dir/$i`; 1060 print LOG "remove un necessary seq file $i\n" 1061 } 1062 } 1063 1064 my $total_jobs = $batch_no_per_node * $num_qsub * $para_no; 1065 1066 for ($k=0; $i0<$NR_no; $i0++) { 1067 $id = $NR_idx[$i0]; 1068 next if ($passeds[$id]); 1069 next if ($in_bg[$id]); 1070 next if ($lens[$id] < $opt_aL_upper_band); 1071 $in_bg[$id] = 1; 1072 1073 my $seq = $seqs[$id]; 1074 open(SEQ, "> $seq_dir/$id") || die "Can not write"; 1075 #print SEQ "$dess[$id]\n$seq\n"; 1076 print SEQ ">$id.$lens[$id]\n$seq\n"; 1077 close(SEQ); 1078 $k++; 1079 last if ($k >= $total_jobs); 1080 } 1081 1082 if ($exec_mode eq "qsub") { 1083 for ($j=0; $j<$num_qsub; $j++) { 1084 my $t = "psi-cd-hit-$j"; 1085 my $cmd = `qsub -N $t $remote_sh_script`; 1086 my $qsub_id = 0; 1087 if ($cmd =~ /(\d+)/) { $qsub_id = $1;} else {die "can not submit qsub job and return a id\n";} 1088 print LOG "qsub querying $j, PID $qsub_id\n"; 1089 $qsub_ids{$qsub_id} = 1; 1090 } 1091 } 1092 elsif ($exec_mode eq "local") { 1093 #my $cmd = `sh $remote_sh_script >/dev/null 2>&1 &`; 1094 my $cmd = `sh $remote_sh_script`; 1095 } 1096 1097 return; 1098} 1099########## END run_batch_blast3 1100 1101 1102sub write_remote_sh_script { 1103 my ($i, $j, $k); 1104 my $local_sh = <<EOD; 1105#!/bin/sh 1106#PBS -v PATH 1107#\$ -v PATH 1108EOD 1109 1110 if ($sh_file) { 1111 $local_sh = `cat $sh_file`; 1112 } 1113 1114 open(RESH, "> $remote_sh_script") || die; 1115 print RESH <<EOD; 1116$local_sh 1117 1118cd $pwd 1119EOD 1120 1121 for ($k=0; $k<$para_no; $k++){ 1122 print RESH "./$remote_perl_script $k&\n" 1123 } 1124 print RESH "wait\n\n"; 1125 1126 close(RESH); 1127 return; 1128} 1129########## END write_remote_sh_script 1130 1131sub write_remote_perl_script { 1132 my $dir1 = "."; 1133 my $bl2 = "$blast_exe -d $dir1/$tmp_db $bl_para"; 1134 $bl2 = "$blast_exe -db $dir1/$tmp_db $bl_para" if ($bl_plus); 1135 1136 my $opti = "-i"; $opti = "-query" if ($bl_plus); 1137 my $opto = "-o"; $opto = "-out" if ($bl_plus); 1138 1139 open(REPERL, "> $remote_perl_script") || die; 1140 print REPERL <<EOD; 1141#!/usr/bin/perl 1142\$host = shift; 1143\$arg = shift; 1144 1145#### random sleep, rand() can be a fraction of second 1146select(undef,undef,undef,rand()); 1147 1148if (\$arg) { 1149 \@ids = split(/,/, \$arg); 1150} 1151else { 1152 while(1) { 1153 if (opendir(DDIR, "$seq_dir")) { 1154 \@ids = grep {/^\\d+\$/} readdir(DDIR); 1155 last; 1156 } 1157 else { 1158 sleep(1); 1159 } 1160 } 1161} 1162 1163foreach \$id (\@ids) { 1164 1165 next unless (-e "$seq_dir/\$id"); 1166 next if (-e "$seq_dir/\$id.lock"); 1167 \$cmd = `touch $seq_dir/\$id.lock`; 1168 1169 if ($bl_STDIN) { 1170 \$cmd = `$bl2 $opti $seq_dir/\$id | $script_name -J parse_blout $bl_dir/\$id -c $NR_clstr -ce $NR_clstre -aS $opt_aS -aL $opt_aL -G $g_iden -prog $blast_prog -bs 1`; 1171 } 1172 else { 1173 \$cmd = `$bl2 $opti $seq_dir/\$id $opto $bl_dir/\$id`; 1174 \$cmd = `$script_name -J parse_blout $bl_dir/\$id -c $NR_clstr -ce $NR_clstre -aS $opt_aS -aL $opt_aL -G $g_iden -prog $blast_prog -bs 0`; 1175 } 1176 \$cmd = `rm -f $seq_dir/\$id`; 1177 \$cmd = `rm -f $seq_dir/\$id.lock`; 1178} 1179 1180(\$tu, \$ts, \$cu, \$cs) = times(); 1181\$tt = \$tu + \$ts + \$cu + \$cs; 1182\$cmd = `echo \$tt >> $seq_dir/host.\$host.cpu`; 1183 1184EOD 1185 close(REPERL); 1186 my $cmd = `chmod 755 $remote_perl_script`; 1187 1188 return; 1189} 1190########## END write_remote_perl_script 1191 1192 1193sub wait_blast_out { 1194 my $out = shift; 1195 print LOG "waiting for $out"; 1196 while(1) { 1197 if (-e $out) { 1198 my $last = `tail -1 $out`; 1199 chop($last); 1200 last if ($last =~ /^#$/); 1201 } 1202 sleep(1); 1203 print LOG "."; 1204 } 1205 print LOG "\n"; 1206 1207 return; 1208} 1209########## END wait_blast_out 1210 1211 1212sub SGE_qstat_xml_query { 1213 my ($i, $j, $k, $cmd, $ll); 1214 %qstat_xml_data = (); #### global 1215 $cmd = `qstat -f -xml`; 1216 if ($cmd =~ /<queue_info/) { #### dummy 1217 $qstat_xml_data{"NULL"}= ["NULL","NULL"]; 1218 } 1219 my $tmp = <<EOD; 1220<?xml version='1.0'?> 1221<job_info xmlns:xsd="http://gridscheduler.svn.sourceforge.net/viewvc/gridscheduler/trunk/source/dist/util/resources/schemas/qstat/qstat.xsd?revision=11"> 1222 <queue_info> 1223 <Queue-List> 1224 <name>all.q\@master</name> 1225 <qtype>BIP</qtype> 1226 <slots_used>0</slots_used> 1227 <slots_resv>0</slots_resv> 1228 <slots_total>0</slots_total> 1229 <load_avg>0.08000</load_avg> 1230 <arch>linux-x64</arch> 1231 </Queue-List> 1232... 1233 <Queue-List> 1234 <name>all.q\@node016</name> 1235 <qtype>BIP</qtype> 1236 <slots_used>32</slots_used> 1237 <slots_resv>0</slots_resv> 1238 <slots_total>32</slots_total> 1239 <load_avg>42.59000</load_avg> 1240 <arch>linux-x64</arch> 1241 <job_list state="running"> ####### running jobs in this section 1242 <JB_job_number>3535</JB_job_number> 1243 <JAT_prio>0.51468</JAT_prio> 1244 <JB_name>cd-hit</JB_name> 1245 <JB_owner>ubuntu</JB_owner> 1246 <state>r</state> 1247 <slots>4</slots> 1248 </job_list> 1249... 1250 </queue_info> 1251 <job_info> 1252 <job_list state="pending"> ######## pending jobs in this section 1253 <JB_job_number>3784</JB_job_number> 1254 <JAT_prio>0.60500</JAT_prio> 1255 <JB_name>cd-hit</JB_name> 1256 <JB_owner>ubuntu</JB_owner> 1257 <state>qw</state> 1258 <slots>32</slots> 1259 </job_list> 1260... 1261 </job_info> 1262</job_info> 1263 1264EOD 1265 my @lls = split(/\n/, $cmd); 1266 $i = 2; #### skip first 2 lines 1267 for (; $i<$#lls+1; $i++) { 1268 if ($lls[$i] =~ /<job_list/) { 1269 my ($id, $name, $state); 1270 for (; $i<$#lls+1; $i++) { 1271 last if ($lls[$i] =~ /<\/job_list/); 1272 if ($lls[$i] =~ /<JB_job_number>(\d+)/) { $id = $1;} 1273 if ($lls[$i] =~ /<JB_name>([^<]+)/) { $name = $1;} 1274 if ($lls[$i] =~ /<state>([^<]+)/) {$state = $1;} 1275 } 1276 if (defined($id) and defined($name) and defined($state)) { 1277 $qstat_xml_data{$id} = [$name, $state]; 1278 } 1279 } 1280 } 1281} 1282 1283 12841; 1285 1286