1# tRNAscanSE/Sequence.pm 2# This class describes a sequence and provides functions for handling fasta files in tRNAscan-SE. 3# 4# -------------------------------------------------------------- 5# This module is part of the tRNAscan-SE program. 6# Copyright (C) 2017 Patricia Chan and Todd Lowe 7# -------------------------------------------------------------- 8# 9# Perl code for reading FASTA-formatted sequence files 10# SRE, Sat Feb 19 19:10:43 1994 11 12# These subroutines read a FASTA formatted file one sequence at a time. 13# Open(filename, open_mode) opens a file for reading or wrting. 14# Close() closes it when you're done. 15# 16# read_fasta() returns 1 on success and 0 on failure (end of file). 17# When it returns success, the following variables are set: 18# 19# $seq_name = name of sequence (1st word on FASTA title line) 20# $seq_description = description (remainder of FASTA title line) 21# $seq_length = length of sequence 22# $sequence = sequence, gaps and newlines removed 23# 24# Modified by TMJL 11/95 for use in tRNAscan-SE 25 26package tRNAscanSE::Sequence; 27 28use strict; 29use tRNAscanSE::Utils; 30use tRNAscanSE::Configuration; 31use tRNAscanSE::Options; 32use tRNAscanSE::tRNA; 33use tRNAscanSE::ArraytRNA; 34use tRNAscanSE::LogFile; 35 36sub new 37{ 38 my $class = shift; 39 my $self = {}; 40 41 initialize($self); 42 43 bless ($self, $class); 44 return $self; 45} 46 47sub DESTROY 48{ 49 my $self = shift; 50} 51 52sub initialize 53{ 54 my $self = shift; 55 $self->{file_name} = ""; # name of log file 56 $self->{FILE_H} = undef; # file handle 57 58 $self->{max_seq_buffer} = 50000000; # Max size of seq buffer read in at once 59 $self->{seq_buf_overlap} = 200; # Nucleotides of overlap between buffers 60 $self->{seq_index_inc} = 100000; 61 62 $self->{saved_line} = ""; 63 $self->{buffer_overlap_seq} = ""; 64 $self->{buffer_end_index} = 0; 65 $self->{seq_buf_overrun} = 0; 66 $self->{buffer_length} = 0; 67 $self->{key_found} = 0; 68 $self->{all_seq_indices} = +[]; # Keeps track of indexing into seqs for fast retreival 69 70 $self->{seq_id} = 0; 71 $self->{seq_name} = ""; 72 $self->{seq_description} = ""; 73 $self->{seq_length} = 0; 74 $self->{sequence} = undef; 75 76 $self->{seq_name_map} = {}; 77} 78 79sub file_name 80{ 81 my $self = shift; 82 if (@_) { $self->{file_name} = shift; } 83 return $self->{file_name}; 84} 85 86sub key_found 87{ 88 my $self = shift; 89 if (@_) { $self->{key_found} = shift; } 90 return $self->{key_found}; 91} 92 93sub seq_id 94{ 95 my $self = shift; 96 if (@_) { $self->{seq_id} = shift; } 97 return $self->{seq_id}; 98} 99 100sub seq_name 101{ 102 my $self = shift; 103 if (@_) { $self->{seq_name} = shift; } 104 return $self->{seq_name}; 105} 106 107sub get_seq_id_from_name 108{ 109 my $self = shift; 110 my $name = shift; 111 my $id = -1; 112 if (defined $self->{seq_name_map}->{$name}) 113 { 114 $id = $self->{seq_name_map}->{$name}; 115 } 116 return $id; 117} 118 119sub seq_description 120{ 121 my $self = shift; 122 if (@_) { $self->{seq_description} = shift; } 123 return $self->{seq_description}; 124} 125 126sub seq_length 127{ 128 my $self = shift; 129 if (@_) { $self->{seq_length} = shift; } 130 return $self->{seq_length}; 131} 132 133sub sequence 134{ 135 my $self = shift; 136 if (@_) { $self->{sequence} = shift; } 137 return $self->{sequence}; 138} 139 140sub release_memory 141{ 142 my $self = shift; 143 undef($self->{sequence}); 144} 145 146sub set_seq_info 147{ 148 my $self = shift; 149 $self->{seq_name} = shift; 150 $self->{seq_description} = shift; 151 $self->{seq_length} = shift; 152 $self->{sequence} = shift; 153} 154 155sub reset_buffer_ct 156{ 157 my $self = shift; 158 $self->{buffer_overlap_seq} = ""; 159 $self->{buffer_end_index} = 0; 160 $self->{seq_buf_overrun} = 0; 161} 162 163sub buffer_end_index 164{ 165 my $self = shift; 166 if (@_) { $self->{buffer_end_index} = shift; } 167 return $self->{buffer_end_index}; 168} 169 170sub seq_buf_overrun 171{ 172 my $self = shift; 173 if (@_) { $self->{seq_buf_overrun} = shift; } 174 return $self->{seq_buf_overrun}; 175} 176 177sub seekpos 178{ 179 my $self = shift; 180 my $pos = shift; 181 182 seek($self->{FILE_H}, $pos, 0); 183} 184 185sub open_file 186{ 187 my $self = shift; 188 my $file = shift; 189 my $mode = shift; 190 191 my $success = 0; 192 193 if ($mode eq "read") 194 { 195 &open_for_read(\$self->{FILE_H}, $file); 196 $self->{seq_id} = 0; 197 $self->{saved_line} = ""; 198 } 199 elsif ($mode eq "write") 200 { 201 &open_for_write(\$self->{FILE_H}, $file); 202 } 203 elsif ($mode eq "append") 204 { 205 &open_for_append(\$self->{FILE_H}, $file); 206 } 207 $self->{file_name} = $file; 208 $success = 1; 209 210 return $success; 211} 212 213sub close_file 214{ 215 my $self = shift; 216 217 if (defined $self->{FILE_H}) 218 { 219 close($self->{FILE_H}); 220 } 221} 222 223# Reads length of sequence first, then pre-extends to total length 224# before reading it in (important optimization for very long sequences) 225# Also, will search for sequence name matching $key 226sub read_fasta 227{ 228 my $self = shift; 229 my $opts = shift; 230 my $target_seq_id = shift; 231 my $key = $opts->seq_key(); 232 my $fh = $self->{FILE_H}; 233 234 my ($seqlen, $filepos, $pre_extend_len, $seq_index_step, @seq_index); 235 236 $self->{seq_name} = ""; 237 $self->{seq_description} = ""; 238 $self->{seq_length} = 0; 239 $self->{sequence} = ""; 240 241# if $key is not the global $seq_key (non-alphanumerics already 242# escaped out for $seq_key) then escape out '\' problem causing char's 243# if ($key ne $seq_key) { 244# $key =~ s/(\W)/\\$1/g; 245# } 246 247 while ((!eof($fh)) && (($self->{saved_line} =~ /^>/) || ($self->{saved_line} = <$fh>))) 248 { 249 if (($self->{saved_line} =~ /^>\s*($key)\s+(.*)$/) || 250 ($opts->start_at_key()) && ($self->{key_found}) && 251 ($self->{saved_line} =~ /^>\s*(\S*)\s+(.*)$/o)) 252 { 253 $self->{seq_id}++; 254 255 # if searching for a particular SeqID go on to next seq 256 # if target and current seqid's don't match 257 if ($target_seq_id && ($self->{seq_id} != $target_seq_id)) 258 { 259 $self->{saved_line} = <$fh>; 260 next; 261 } 262 263 $self->{key_found} = 1; 264 $self->{seq_name} = $1; 265 $self->{seq_description} = $2; 266 $self->{sequence} = ""; 267 $self->{seq_name_map}->{$self->{seq_name}} = $self->{seq_id}; 268 269 @seq_index = (); 270 $seq_index_step = $self->{seq_index_inc}; # set first bp position to save 271 272 $filepos = tell($fh); 273 $seqlen = 0; 274 push(@seq_index, $seqlen, tell($fh)); 275 $pre_extend_len = 0; 276# print LOGFILE "At pos: "; 277 278 while ($self->{saved_line} = <$fh>) 279 { 280 if ($self->{saved_line} =~ /^>/) { last; } 281 $self->{saved_line} =~ s/[ \n\t\d]//g; # strip whitespace & numbers 282 $seqlen += length($self->{saved_line}); 283 284 # Save the start position of this chunk of seq for later easy return 285 if ($seqlen > $seq_index_step) 286 { 287 push(@seq_index, $seqlen, tell($fh)); 288 $seq_index_step += $self->{seq_index_inc}; 289# print LOGFILE "($Seqlen) "; 290 } 291 292 if (($pre_extend_len == 0) && ($seqlen >= $self->{max_seq_buffer})) 293 { 294 $pre_extend_len = $seqlen; 295 } 296 } 297 push(@seq_index, $seqlen, tell($fh)); 298 $self->{seq_length} = $seqlen; 299# print LOGFILE " "; 300 301 $self->{all_seq_indices}->[$self->{seq_id}] = [@seq_index]; 302 303 seek($fh,$filepos,0); 304 $self->{sequence} = 'X' x $pre_extend_len; # pre-extending string for efficiency 305 $seqlen = 0; 306 while (($seqlen < $self->{max_seq_buffer}) && ($self->{saved_line} = <$fh>)) 307 { 308 if ($self->{saved_line} =~ /^>/) { last; } 309 $self->{saved_line} =~ s/[ \n\t\d]//g; # strip whitespace & numbers 310 substr($self->{sequence}, $seqlen, length($self->{saved_line})) = $self->{saved_line}; 311 $seqlen += length($self->{saved_line}); 312 } 313 314 # if sequence is longer than MaxSeqBuffer length, 315 # then save last ~200 nt to allow overlap with next buffer frame 316 # this prevents tRNAs on the border between buffers from being chopped 317 # in half (and missed!) 318 319 if ($seqlen >= $self->{max_seq_buffer}) 320 { 321 $self->{buffer_overlap_seq} = substr($self->{sequence}, $seqlen - $self->{seq_buf_overlap}); 322 $self->{buffer_end_index} = $seqlen - length($self->{buffer_overlap_seq}); 323 $self->{seq_buf_overrun} = 1; 324 } 325 else 326 { 327 $self->{seq_buf_overrun} = 0; 328 } 329 330 $self->{buffer_length} = length($self->{sequence}); 331 $self->{sequence} = uc($self->{sequence}); 332 $self->{sequence} =~ s/U/T/g; 333 $self->{sequence} =~ s/X/N/g; 334 335 ## Remove long runs of N's from consideration by pre-scanners 336 ## By doing this, pre-scanner false-pos rate is normal, even 337 ## when scanning unfinished genomes with long N insert "placeholders" 338 if ($opts->tscan_mode() or $opts->eufind_mode()) 339 { 340 $self->{sequence} =~ s/NNNNNNNNNN/CCCCCCCCCC/g; 341 } 342 343 return 1; 344 } 345 else 346 { 347 if ($self->{saved_line} =~ /^>/) 348 { 349 $self->{seq_id}++; 350 } 351 $self->{saved_line} = <$fh>; 352 } 353 } 354 0; 355} 356 357sub read_fasta_subseq 358{ 359 my $self = shift; 360 my $target_seq_id = shift; 361 my $subseq_start = shift; 362 my $subseq_len = shift; 363 my $fh = $self->{FILE_H}; 364 365 my ($seqlen, $filepos, $curpos, $tempseq, $seq_head, $index_pos, $ct); 366 367 $self->{seq_length} = 0; 368 $self->{sequence} = ""; 369 370 # find closest position in desired sequence from file position index 371 372 $ct=0; 373 if (!defined $self->{all_seq_indices}->[$target_seq_id]) 374 { 375 $seqlen = 0; 376 $index_pos = 0; 377 } 378 else 379 { 380 while ($self->{all_seq_indices}->[$target_seq_id][$ct] < $subseq_start) 381 { 382 $ct+=2; 383 } 384 $seqlen = $self->{all_seq_indices}->[$target_seq_id][$ct-2]; 385 $index_pos = $self->{all_seq_indices}->[$target_seq_id][$ct-1]; 386 } 387 seek ($fh, $index_pos, 0); 388 389 $tempseq = ""; 390 391 # scan until I get to the sequence position 392 393 while (($seqlen < $subseq_start) && ($self->{saved_line} = <$fh>)) 394 { 395 if ($self->{saved_line} =~ /^>/) 396 { 397 return 0; 398 } 399 $self->{saved_line} =~ s/[ \n\t\d]//g; # strip whitespace & numbers 400 $seqlen += length($self->{saved_line}); 401 } 402 403 $tempseq = 'X' x $subseq_len; # pre-extending string for efficiency 404 405 $curpos = $seqlen - length($self->{saved_line}); 406 $seq_head = substr($self->{saved_line}, $subseq_start - $curpos - 1); 407 substr($tempseq, 0, length($seq_head)) = $seq_head; 408 409 $seqlen = length($seq_head); 410 411 while (($seqlen < $subseq_len) && ($self->{saved_line} = <$fh>)) 412 { 413 if ($self->{saved_line} =~ /^>/) { last; } 414 $self->{saved_line} =~ s/[ \n\t\d]//g; # strip whitespace & numbers 415 substr($tempseq, $seqlen, length($self->{saved_line})) = $self->{saved_line}; 416 $seqlen += length($self->{saved_line}); 417 } 418 419 $self->{sequence} = substr($tempseq, 0, $subseq_len); 420 421 $self->{sequence} = uc($self->{sequence}); 422 $self->{sequence} =~ s/U/T/g; 423 $self->{sequence} =~ s/X/N/g; 424 $self->{seq_length} = length($self->{sequence}); 425 return 1; 426} 427 428sub read_fasta_subseq_slow 429{ 430 my $self = shift; 431 my $opts = shift; 432 my $key = shift; 433 my $target_seq_id = shift; 434 my $subseq_start = shift; 435 my $subseq_len = shift; 436 my $fh = $self->{FILE_H}; 437 438 my ($seqlen, $filepos, $curpos, $tempseq); 439 my $last_header = ""; 440 my $seq_head = ""; 441 442 $self->{seq_length} = 0; 443 $self->{sequence} = ""; 444 445# if $key is not the global $seq_key (non-alphanumerics already 446# escaped out for $seq_key) then escape out '\' problem causing char's 447# if ($key ne $seq_key) { 448 $key =~ s/(\W)/\\$1/g; 449# } 450 451 while ((!eof(FAHANDLE)) && (($self->{saved_line} =~ /^>/) || ($self->{saved_line} = <FAHANDLE>))) 452 { 453 if (($self->{saved_line} =~ /^>\s*($key)\s+(.*)$/) || 454 ($opts->start_at_key()) && ($self->{key_found}) && 455 ($self->{saved_line} =~ /^>\s*(\S*)\s+(.*)$/o)) 456 { 457 $self->{seq_id}++; 458 459 # if searching for a particular SeqID go on to next seq 460 # if target and current seqid's don't match 461 if ($target_seq_id && ($self->{seq_id} != $target_seq_id)) 462 { 463 $self->{saved_line} = <$fh>; 464 next; 465 } 466 467 $filepos = tell($fh); # save position of last fasta header 468 $last_header = $self->{saved_line}; 469 470 $self->{key_found} = 1; 471 $self->{seq_name} = $1; 472 $self->{seq_description} = $2; 473 $self->{sequence} = ""; 474 $tempseq = ""; 475 476 $seqlen = 0; 477 while (($seqlen < $subseq_start) && ($self->{saved_line} = <$fh>)) 478 { 479 if ($self->{saved_line} =~ /^>/) { last; } 480 $self->{saved_line} =~ s/[ \n\t\d]//g; # strip whitespace & numbers 481 $seqlen += length($self->{saved_line}); 482 } 483 484 $tempseq = 'X' x $subseq_len; # pre-extending string for efficiency 485 486 $curpos = $seqlen - length($self->{saved_line}); 487 $seq_head = substr($self->{saved_line}, $subseq_start - $curpos - 1); 488 substr($tempseq, 0, length($seq_head)) = $seq_head; 489 490 $seqlen = length($seq_head); 491 492 while (($seqlen < $subseq_len) && ($self->{saved_line} = <$fh>)) 493 { 494 if ($self->{saved_line} =~ /^>/) { last; } 495 $self->{saved_line} =~ s/[ \n\t\d]//g; # strip whitespace & numbers 496 substr($tempseq, $seqlen, length($self->{saved_line})) = $self->{saved_line}; 497 $seqlen += length($self->{saved_line}); 498 } 499 500 $self->{sequence} = substr($tempseq, 0, $subseq_len); 501 502 $self->{sequence} = uc($self->{sequence}); 503 $self->{sequence} =~ s/U/T/g; 504 $self->{sequence} =~ s/X/N/g; 505 $self->{seq_length} = length($self->{sequence}); 506 seek($fh, $filepos, 0); # return file position to beginning of this seq 507 $self->{seq_id}--; # rewind seqid by 1 508 $self->{saved_line} = $last_header; # restore to original seq header line 509 return 1; 510 } 511 else 512 { 513 if ($self->{saved_line} =~ /^>/) 514 { 515 $self->{seq_id}++; 516 } 517 $self->{saved_line} = <$fh>; 518 } 519 } 520 0; 521} 522 523## read_more_fasta 524## Reads remaining portion of large fasta file (size>$MaxSeqBuffer) 525## Only reads in $MaxSeqBuffer amount or less each time 526 527sub read_more_fasta 528{ 529 my $self = shift; 530 my $opts = shift; 531 my $fh = $self->{FILE_H}; 532 533 my ($seqlen, $filepos); 534 535 $filepos = tell($fh); 536 $seqlen = 0; 537 while (($seqlen + $self->{seq_buf_overlap} < $self->{max_seq_buffer}) && ($self->{saved_line} = <$fh>)) 538 { 539 if ($self->{saved_line} =~ /^>/) { last; } 540 $self->{saved_line} =~ s/[ \n\t\d]//g; # strip whitespace & numbers 541 $seqlen += length($self->{saved_line}); 542 } 543 544 if ($seqlen == 0) 545 { 546 return 0; 547 } 548 549 seek($fh, $filepos, 0); 550 551 $self->{sequence} = $self->{buffer_overlap_seq}. 'X' x $seqlen; # pre-extending string for efficiency 552 $seqlen = length($self->{buffer_overlap_seq}); 553 554 while (($seqlen < $self->{max_seq_buffer}) && ($self->{saved_line} = <$fh>)) 555 { 556 if ($self->{saved_line} =~ /^>/) { last; } 557 $self->{saved_line} =~ s/[ \n\t\d]//g; # strip whitespace & numbers 558 substr($self->{sequence}, $seqlen, length($self->{saved_line})) = $self->{saved_line}; 559 $seqlen += length($self->{saved_line}); 560 } 561 562 # if sequence is longer than MaxSeqBuffer length, 563 # then save last ~200 nt to allow overlap with next buffer frame 564 # this prevents tRNAs on the border between buffers from being chopped 565 # in half (and missed!) 566 567 if ($seqlen >= $self->{max_seq_buffer}) 568 { 569 $self->{buffer_overlap_seq} = substr($self->{sequence}, $seqlen - $self->{seq_buf_overlap}); 570 $self->{buffer_end_index} += $seqlen - length($self->{buffer_overlap_seq}); 571 $self->{seq_buf_overrun} = 1; 572 } 573 else 574 { 575 $self->{seq_buf_overrun} = 0; 576 } 577 578 $self->{buffer_length} = length($self->{sequence}); 579 $self->{sequence} = uc($self->{sequence}); 580 $self->{sequence} =~ s/U/T/g; 581 $self->{sequence} =~ s/X/N/g; 582 583 ## Remove long runs of N's from consideration by pre-scanners 584 ## By doing this, pre-scanner false-pos rate is normal, even 585 ## when scanning unfinished genomes with long N insert "placeholders" 586 if ($opts->tscan_mode() or $opts->eufind_mode()) 587 { 588 $self->{sequence} =~ s/NNNNNNNNNN/CCCCCCCCCC/g; 589 } 590 591 return 1; 592} 593 594sub write_fasta 595{ 596 my $self = shift; 597 my $fh = $self->{FILE_H}; 598 599 my ($pos, $line); 600 601 print $fh ">$self->{seq_name} $self->{seq_description}\n"; 602 for ($pos = 0; $pos < length($self->{sequence}); $pos += 60) 603 { 604 $line = substr($self->{sequence}, $pos, 60); 605 print $fh $line, "\n"; 606 } 607} 608 609sub get_tRNA_sequence 610{ 611 my $self = shift; 612 my ($global_vars, $trna) = @_; 613 my $global_constants = $global_vars->{global_constants}; 614 my $opts = $global_vars->{options}; 615 my $log = $global_vars->{log_file}; 616 617 $self->{seq_name} = $trna->seqname(); 618 $self->{seq_description} = ""; 619 620 my ($src_seq_len, $fwd_start, $query_len, $upstream, $downstream, $tRNA_seq); 621 my $src_seqid = $self->get_seq_id_from_name($trna->seqname()); 622 623 my $upstream_len = $global_constants->get("upstream_len"); 624 my $downstream_len = $global_constants->get("downstream_len"); 625 626 if ($trna->strand() eq "+") 627 { 628 if ($trna->start() - $upstream_len <= 0) 629 { 630 $upstream_len = $trna->start() - 1; 631 } 632 $fwd_start = $trna->start() - $upstream_len; 633 $src_seq_len = $trna->end() - $trna->start() + 1; 634 } 635 else 636 { 637 if ($trna->end() < $trna->start()) 638 { 639 if ($trna->end() - $downstream_len <= 0) 640 { 641 $downstream_len = $trna->end() - 1; 642 } 643 $fwd_start = $trna->end() - $downstream_len; 644 $src_seq_len = $trna->start() - $trna->end() + 1; 645 } 646 else 647 { 648 if ($trna->start() - $downstream_len <= 0) 649 { 650 $downstream_len = $trna->start() - 1; 651 } 652 $fwd_start = $trna->start() - $downstream_len; 653 $src_seq_len = $trna->end() - $trna->start() + 1; 654 } 655 } 656 $query_len = $upstream_len + $src_seq_len + $downstream_len; 657 658 if (!$self->read_fasta_subseq($src_seqid, $fwd_start, $query_len)) 659 { 660 # if can't find it on first try, reposition 661 # to beginning of file & try once more 662 $log->broadcast("Missed ".$trna->seqname()." using quick index. Rewinding seq file and trying again with slow search..."); 663 $self->seekpos(0); 664 if (!$self->read_fasta_subseq_slow($opts, $trna->seqname(), $src_seqid, $fwd_start, $query_len)) 665 { 666 $log->warning("Could not find ".$trna->seqname()." in ".$opts->fasta_file()); 667 $log->broadcast("Skipping to next tRNA hit..."); 668 return 0; 669 } 670 } 671 672 if ($trna->strand() eq "+") 673 { 674 $downstream_len = $self->{seq_length} - $upstream_len - $src_seq_len; 675 $upstream = substr($self->{sequence}, 0, $upstream_len); 676 $downstream = ""; 677 if ($downstream_len > 0) 678 { 679 $downstream = substr($self->{sequence}, $upstream_len + $src_seq_len); 680 } 681 $tRNA_seq = substr($self->{sequence}, $upstream_len, $src_seq_len); 682 } 683 else 684 { 685 $upstream_len = $self->{seq_length} - $downstream_len - $src_seq_len; 686 $self->{sequence} = &rev_comp_seq($self->{sequence}); 687 $upstream = ""; 688 if ($upstream_len > 0) 689 { 690 $upstream = substr($self->{sequence}, 0, $upstream_len); 691 } 692 $downstream = substr($self->{sequence}, $upstream_len + $src_seq_len); 693 $tRNA_seq = substr($self->{sequence}, $upstream_len, $src_seq_len); 694 } 695 696 $trna->seq($tRNA_seq); 697 $trna->upstream($upstream); 698 $trna->downstream($downstream); 699} 700 701sub mask_out_sequence 702{ 703 my $self = shift; 704 my ($seq_file, $temp_seq_file, $tRNA_hits) = @_; 705 706 my $fh_seq_in = undef; 707 my $fh_seq_out = undef; 708 my $line = ""; 709 my $last_line = ""; 710 my $seqname = ""; 711 my %cms_hits = (); 712 my $hits = []; 713 my $ct = 0; 714 my $written_len = 0; 715 my $seq_start = 0; 716 my $subseq_start = 0; 717 my $subseq_len = 0; 718 my $N_start = 0; 719 720 for (my $i = 0; $i < $tRNA_hits->get_count(); $i++) 721 { 722 if (defined $cms_hits{$tRNA_hits->get($i)->seqname()}) 723 { 724 push (@{$cms_hits{$tRNA_hits->get($i)->seqname()}}, $tRNA_hits->get($i)); 725 } 726 else 727 { 728 $hits = []; 729 push (@$hits, $tRNA_hits->get($i)); 730 $cms_hits{$tRNA_hits->get($i)->seqname()} = $hits; 731 } 732 } 733 734 &open_for_read(\$fh_seq_in, $seq_file); 735 &open_for_write(\$fh_seq_out, $temp_seq_file); 736 737 while ($line = <$fh_seq_in>) 738 { 739 chomp($line); 740 if ($line =~ /^>([^\t]+)$/) 741 { 742 $seqname = $1; 743 $seqname = &trim($seqname); 744 if (index($seqname, ' ') > -1) 745 { 746 $seqname = substr($seqname, 0, index($seqname, ' ')); 747 } 748 $hits = undef; 749 $ct = 0; 750 if (defined $cms_hits{$seqname}) 751 { 752 $hits = $cms_hits{$seqname}; 753 $subseq_len = $hits->[$ct]->len(); 754 $seq_start = $hits->[$ct]->start(); 755 } 756 $written_len = 0; 757 $N_start = 0; 758 if ($last_line ne "") 759 { 760 print $fh_seq_out $last_line . "\n"; 761 $last_line = ""; 762 } 763 print $fh_seq_out $line . "\n"; 764 } 765 elsif ($line =~ /^\s*$/) 766 {} 767 else 768 { 769 if ($last_line ne "") 770 { 771 $line = $last_line . $line; 772 $last_line = ""; 773 } 774 if (defined $hits) 775 { 776 if ($ct < scalar(@$hits)) 777 { 778 if ($written_len + length($line) < $seq_start) 779 { 780 print $fh_seq_out $line . "\n"; 781 $written_len += length($line); 782 } 783 else 784 { 785 if ($N_start > 0) 786 { 787 $subseq_start = 1; 788 } 789 else 790 { 791 $subseq_start = $seq_start - $written_len; 792 print $fh_seq_out substr($line, 0, $subseq_start - 1); 793 $written_len += ($subseq_start - 1); 794 } 795 if (length($line) >= ($subseq_start + $subseq_len - 1)) 796 { 797 print $fh_seq_out 'N' x $subseq_len; 798 print $fh_seq_out "\n"; 799 $written_len += $subseq_len; 800 if (length($line) > ($subseq_start + $subseq_len - 1)) 801 { 802 $last_line = substr($line, $subseq_start + $subseq_len - 1); 803 } 804 $N_start = 0; 805 $ct++; 806 if ($ct < scalar(@$hits)) 807 { 808 $subseq_len = $hits->[$ct]->len(); 809 $seq_start = $hits->[$ct]->start(); 810 } 811 } 812 else 813 { 814 print $fh_seq_out 'N' x (length($line) - $subseq_start + 1); 815 print $fh_seq_out "\n"; 816 $written_len += (length($line) - $subseq_start + 1); 817 $N_start = 1; 818 $subseq_len -= (length($line) - $subseq_start + 1); 819 } 820 } 821 } 822 else 823 { 824 print $fh_seq_out $line . "\n"; 825 $written_len += length($line); 826 } 827 } 828 else 829 { 830 print $fh_seq_out $line . "\n"; 831 $written_len += length($line); 832 } 833 } 834 } 835 836 close($fh_seq_in); 837 close($fh_seq_out); 838} 839 8401; 841