1# tRNAscanSE/Sequence.pm
2# This class describes a sequence and provides functions for handling fasta files in tRNAscan-SE.
3#
4# --------------------------------------------------------------
5# This module is part of the tRNAscan-SE program.
6# Copyright (C) 2017 Patricia Chan and Todd Lowe
7# --------------------------------------------------------------
8#
9# Perl code for reading FASTA-formatted sequence files
10# SRE, Sat Feb 19 19:10:43 1994
11
12# These subroutines read a FASTA formatted file one sequence at a time.
13# Open(filename, open_mode) opens a file for reading or wrting.
14# Close() closes it when you're done.
15#
16# read_fasta() returns 1 on success and 0 on failure (end of file).
17# When it returns success, the following variables are set:
18#
19#       $seq_name        = name of sequence (1st word on FASTA title line)
20#       $seq_description = description      (remainder of FASTA title line)
21#       $seq_length      = length of sequence
22#       $sequence        = sequence, gaps and newlines removed
23#
24# Modified by TMJL  11/95 for use in tRNAscan-SE
25
26package tRNAscanSE::Sequence;
27
28use strict;
29use tRNAscanSE::Utils;
30use tRNAscanSE::Configuration;
31use tRNAscanSE::Options;
32use tRNAscanSE::tRNA;
33use tRNAscanSE::ArraytRNA;
34use tRNAscanSE::LogFile;
35
36sub new
37{
38    my $class = shift;
39    my $self = {};
40
41    initialize($self);
42
43    bless ($self, $class);
44    return $self;
45}
46
47sub DESTROY
48{
49    my $self = shift;
50}
51
52sub initialize
53{
54    my $self = shift;
55    $self->{file_name} = "";             # name of log file
56    $self->{FILE_H} = undef;             # file handle
57
58    $self->{max_seq_buffer} = 50000000;    # Max size of seq buffer read in at once
59    $self->{seq_buf_overlap} = 200;        # Nucleotides of overlap between buffers
60    $self->{seq_index_inc} = 100000;
61
62    $self->{saved_line} = "";
63    $self->{buffer_overlap_seq} = "";
64    $self->{buffer_end_index} = 0;
65    $self->{seq_buf_overrun} = 0;
66    $self->{buffer_length} = 0;
67    $self->{key_found} = 0;
68    $self->{all_seq_indices} = +[];        # Keeps track of indexing into seqs for fast retreival
69
70    $self->{seq_id} = 0;
71    $self->{seq_name} = "";
72    $self->{seq_description} = "";
73    $self->{seq_length} = 0;
74    $self->{sequence} = undef;
75
76    $self->{seq_name_map} = {};
77}
78
79sub file_name
80{
81    my $self = shift;
82    if (@_) { $self->{file_name} = shift; }
83    return $self->{file_name};
84}
85
86sub key_found
87{
88    my $self = shift;
89    if (@_) { $self->{key_found} = shift; }
90    return $self->{key_found};
91}
92
93sub seq_id
94{
95    my $self = shift;
96    if (@_) { $self->{seq_id} = shift; }
97    return $self->{seq_id};
98}
99
100sub seq_name
101{
102    my $self = shift;
103    if (@_) { $self->{seq_name} = shift; }
104    return $self->{seq_name};
105}
106
107sub get_seq_id_from_name
108{
109    my $self = shift;
110    my $name = shift;
111    my $id = -1;
112    if (defined $self->{seq_name_map}->{$name})
113    {
114        $id = $self->{seq_name_map}->{$name};
115    }
116    return $id;
117}
118
119sub seq_description
120{
121    my $self = shift;
122    if (@_) { $self->{seq_description} = shift; }
123    return $self->{seq_description};
124}
125
126sub seq_length
127{
128    my $self = shift;
129    if (@_) { $self->{seq_length} = shift; }
130    return $self->{seq_length};
131}
132
133sub sequence
134{
135    my $self = shift;
136    if (@_) { $self->{sequence} = shift; }
137    return $self->{sequence};
138}
139
140sub release_memory
141{
142    my $self = shift;
143    undef($self->{sequence});
144}
145
146sub set_seq_info
147{
148    my $self = shift;
149    $self->{seq_name} = shift;
150    $self->{seq_description} = shift;
151    $self->{seq_length} = shift;
152    $self->{sequence} = shift;
153}
154
155sub reset_buffer_ct
156{
157    my $self = shift;
158    $self->{buffer_overlap_seq} = "";
159    $self->{buffer_end_index} = 0;
160    $self->{seq_buf_overrun} = 0;
161}
162
163sub buffer_end_index
164{
165    my $self = shift;
166    if (@_) { $self->{buffer_end_index} = shift; }
167    return $self->{buffer_end_index};
168}
169
170sub seq_buf_overrun
171{
172    my $self = shift;
173    if (@_) { $self->{seq_buf_overrun} = shift; }
174    return $self->{seq_buf_overrun};
175}
176
177sub seekpos
178{
179    my $self = shift;
180    my $pos = shift;
181
182    seek($self->{FILE_H}, $pos, 0);
183}
184
185sub open_file
186{
187    my $self = shift;
188    my $file = shift;
189    my $mode = shift;
190
191    my $success = 0;
192
193    if ($mode eq "read")
194    {
195        &open_for_read(\$self->{FILE_H}, $file);
196        $self->{seq_id} = 0;
197        $self->{saved_line} = "";
198    }
199    elsif ($mode eq "write")
200    {
201        &open_for_write(\$self->{FILE_H}, $file);
202    }
203    elsif ($mode eq "append")
204    {
205        &open_for_append(\$self->{FILE_H}, $file);
206    }
207    $self->{file_name} = $file;
208    $success = 1;
209
210    return $success;
211}
212
213sub close_file
214{
215    my $self = shift;
216
217    if (defined $self->{FILE_H})
218    {
219        close($self->{FILE_H});
220    }
221}
222
223# Reads length of sequence first, then pre-extends to total length
224# before reading it in (important optimization for very long sequences)
225# Also, will search for sequence name matching $key
226sub read_fasta
227{
228    my $self = shift;
229    my $opts = shift;
230    my $target_seq_id = shift;
231    my $key = $opts->seq_key();
232    my $fh = $self->{FILE_H};
233
234    my ($seqlen, $filepos, $pre_extend_len, $seq_index_step, @seq_index);
235
236    $self->{seq_name} = "";
237    $self->{seq_description} = "";
238    $self->{seq_length} = 0;
239    $self->{sequence} = "";
240
241# if $key is not the global $seq_key (non-alphanumerics already
242#  escaped out for $seq_key) then escape out '\' problem causing char's
243#    if ($key ne $seq_key) {
244#        $key =~ s/(\W)/\\$1/g;
245#    }
246
247    while ((!eof($fh)) && (($self->{saved_line} =~ /^>/) || ($self->{saved_line} = <$fh>)))
248    {
249        if (($self->{saved_line} =~ /^>\s*($key)\s+(.*)$/) ||
250            ($opts->start_at_key()) && ($self->{key_found}) &&
251            ($self->{saved_line} =~ /^>\s*(\S*)\s+(.*)$/o))
252        {
253            $self->{seq_id}++;
254
255            # if searching for a particular SeqID go on to next seq
256            #  if target and current seqid's don't match
257            if ($target_seq_id && ($self->{seq_id} != $target_seq_id))
258            {
259                $self->{saved_line} = <$fh>;
260                next;
261            }
262
263            $self->{key_found}       = 1;
264            $self->{seq_name}        = $1;
265            $self->{seq_description} = $2;
266            $self->{sequence}        = "";
267            $self->{seq_name_map}->{$self->{seq_name}} = $self->{seq_id};
268
269            @seq_index        = ();
270            $seq_index_step   = $self->{seq_index_inc};   # set first bp position to save
271
272            $filepos = tell($fh);
273            $seqlen = 0;
274            push(@seq_index, $seqlen, tell($fh));
275            $pre_extend_len = 0;
276#            print LOGFILE "At pos: ";
277
278            while ($self->{saved_line} = <$fh>)
279            {
280                if ($self->{saved_line} =~ /^>/) { last; }
281                $self->{saved_line} =~ s/[ \n\t\d]//g;     # strip whitespace & numbers
282                $seqlen += length($self->{saved_line});
283
284                # Save the start position of this chunk of seq for later easy return
285                if ($seqlen > $seq_index_step)
286                {
287                    push(@seq_index, $seqlen, tell($fh));
288                    $seq_index_step += $self->{seq_index_inc};
289#                    print LOGFILE "($Seqlen) ";
290                }
291
292                if (($pre_extend_len == 0) && ($seqlen >= $self->{max_seq_buffer}))
293                {
294                    $pre_extend_len = $seqlen;
295                }
296            }
297            push(@seq_index, $seqlen, tell($fh));
298            $self->{seq_length} = $seqlen;
299#            print LOGFILE " ";
300
301            $self->{all_seq_indices}->[$self->{seq_id}] = [@seq_index];
302
303            seek($fh,$filepos,0);
304            $self->{sequence} = 'X' x $pre_extend_len;  # pre-extending string for efficiency
305            $seqlen = 0;
306            while (($seqlen < $self->{max_seq_buffer}) && ($self->{saved_line} = <$fh>))
307            {
308                if ($self->{saved_line} =~ /^>/) { last; }
309                $self->{saved_line} =~ s/[ \n\t\d]//g;     # strip whitespace & numbers
310                substr($self->{sequence}, $seqlen, length($self->{saved_line})) = $self->{saved_line};
311                $seqlen += length($self->{saved_line});
312            }
313
314            # if sequence is longer than MaxSeqBuffer length,
315            # then save last ~200 nt to allow overlap with next buffer frame
316            # this prevents tRNAs on the border between buffers from being chopped
317            # in half (and missed!)
318
319            if ($seqlen >= $self->{max_seq_buffer})
320            {
321                $self->{buffer_overlap_seq} = substr($self->{sequence}, $seqlen - $self->{seq_buf_overlap});
322                $self->{buffer_end_index}   = $seqlen - length($self->{buffer_overlap_seq});
323                $self->{seq_buf_overrun} = 1;
324            }
325            else
326            {
327                $self->{seq_buf_overrun} = 0;
328            }
329
330            $self->{buffer_length} = length($self->{sequence});
331            $self->{sequence} = uc($self->{sequence});
332            $self->{sequence} =~ s/U/T/g;
333            $self->{sequence} =~ s/X/N/g;
334
335            ## Remove long runs of N's from consideration by pre-scanners
336            ## By doing this, pre-scanner false-pos rate is normal, even
337            ## when scanning unfinished genomes with long N insert "placeholders"
338            if ($opts->tscan_mode() or $opts->eufind_mode())
339            {
340                $self->{sequence} =~ s/NNNNNNNNNN/CCCCCCCCCC/g;
341            }
342
343            return 1;
344        }
345        else
346        {
347            if ($self->{saved_line} =~ /^>/)
348            {
349                $self->{seq_id}++;
350            }
351            $self->{saved_line} = <$fh>;
352        }
353    }
354    0;
355}
356
357sub read_fasta_subseq
358{
359    my $self = shift;
360    my $target_seq_id = shift;
361    my $subseq_start = shift;
362    my $subseq_len = shift;
363    my $fh = $self->{FILE_H};
364
365    my ($seqlen, $filepos, $curpos, $tempseq, $seq_head, $index_pos, $ct);
366
367    $self->{seq_length} = 0;
368    $self->{sequence} = "";
369
370    # find closest position in desired sequence from file position index
371
372    $ct=0;
373    if (!defined $self->{all_seq_indices}->[$target_seq_id])
374    {
375        $seqlen = 0;
376        $index_pos = 0;
377    }
378    else
379    {
380        while ($self->{all_seq_indices}->[$target_seq_id][$ct] < $subseq_start)
381        {
382            $ct+=2;
383        }
384        $seqlen     = $self->{all_seq_indices}->[$target_seq_id][$ct-2];
385        $index_pos  = $self->{all_seq_indices}->[$target_seq_id][$ct-1];
386    }
387    seek ($fh, $index_pos, 0);
388
389    $tempseq = "";
390
391    # scan until I get to the sequence position
392
393    while (($seqlen < $subseq_start) && ($self->{saved_line} = <$fh>))
394    {
395        if ($self->{saved_line} =~ /^>/)
396        {
397            return 0;
398        }
399        $self->{saved_line} =~ s/[ \n\t\d]//g;     # strip whitespace & numbers
400        $seqlen += length($self->{saved_line});
401    }
402
403    $tempseq = 'X' x $subseq_len;  # pre-extending string for efficiency
404
405    $curpos = $seqlen - length($self->{saved_line});
406    $seq_head = substr($self->{saved_line}, $subseq_start - $curpos - 1);
407    substr($tempseq, 0, length($seq_head)) = $seq_head;
408
409    $seqlen = length($seq_head);
410
411    while (($seqlen < $subseq_len) && ($self->{saved_line} = <$fh>))
412    {
413        if ($self->{saved_line} =~ /^>/) { last; }
414        $self->{saved_line} =~ s/[ \n\t\d]//g;     # strip whitespace & numbers
415        substr($tempseq, $seqlen, length($self->{saved_line})) = $self->{saved_line};
416        $seqlen += length($self->{saved_line});
417    }
418
419    $self->{sequence} = substr($tempseq, 0, $subseq_len);
420
421    $self->{sequence} = uc($self->{sequence});
422    $self->{sequence} =~ s/U/T/g;
423    $self->{sequence} =~ s/X/N/g;
424    $self->{seq_length} = length($self->{sequence});
425    return 1;
426}
427
428sub read_fasta_subseq_slow
429{
430    my $self = shift;
431    my $opts = shift;
432    my $key = shift;
433    my $target_seq_id = shift;
434    my $subseq_start = shift;
435    my $subseq_len = shift;
436    my $fh = $self->{FILE_H};
437
438    my ($seqlen, $filepos, $curpos, $tempseq);
439    my $last_header = "";
440    my $seq_head = "";
441
442    $self->{seq_length} = 0;
443    $self->{sequence} = "";
444
445# if $key is not the global $seq_key (non-alphanumerics already
446#  escaped out for $seq_key) then escape out '\' problem causing char's
447#  if ($key ne $seq_key) {
448        $key =~ s/(\W)/\\$1/g;
449#    }
450
451    while ((!eof(FAHANDLE)) && (($self->{saved_line} =~ /^>/) || ($self->{saved_line} = <FAHANDLE>)))
452    {
453        if (($self->{saved_line} =~ /^>\s*($key)\s+(.*)$/) ||
454            ($opts->start_at_key()) && ($self->{key_found}) &&
455            ($self->{saved_line} =~ /^>\s*(\S*)\s+(.*)$/o))
456        {
457            $self->{seq_id}++;
458
459            # if searching for a particular SeqID go on to next seq
460            #  if target and current seqid's don't match
461            if ($target_seq_id && ($self->{seq_id} != $target_seq_id))
462            {
463                $self->{saved_line} = <$fh>;
464                next;
465            }
466
467            $filepos = tell($fh);  # save position of last fasta header
468            $last_header = $self->{saved_line};
469
470            $self->{key_found} = 1;
471            $self->{seq_name}        = $1;
472            $self->{seq_description} = $2;
473            $self->{sequence}        = "";
474            $tempseq = "";
475
476            $seqlen = 0;
477            while (($seqlen < $subseq_start) && ($self->{saved_line} = <$fh>))
478            {
479                if ($self->{saved_line} =~ /^>/) { last; }
480                $self->{saved_line} =~ s/[ \n\t\d]//g;     # strip whitespace & numbers
481                $seqlen += length($self->{saved_line});
482            }
483
484            $tempseq = 'X' x $subseq_len;  # pre-extending string for efficiency
485
486            $curpos = $seqlen - length($self->{saved_line});
487            $seq_head = substr($self->{saved_line}, $subseq_start - $curpos - 1);
488            substr($tempseq, 0, length($seq_head)) = $seq_head;
489
490            $seqlen = length($seq_head);
491
492            while (($seqlen < $subseq_len) && ($self->{saved_line} = <$fh>))
493            {
494                if ($self->{saved_line} =~ /^>/) { last; }
495                $self->{saved_line} =~ s/[ \n\t\d]//g;      # strip whitespace & numbers
496                substr($tempseq, $seqlen, length($self->{saved_line})) = $self->{saved_line};
497                $seqlen += length($self->{saved_line});
498            }
499
500            $self->{sequence} = substr($tempseq, 0, $subseq_len);
501
502            $self->{sequence} = uc($self->{sequence});
503            $self->{sequence} =~ s/U/T/g;
504            $self->{sequence} =~ s/X/N/g;
505            $self->{seq_length} = length($self->{sequence});
506            seek($fh, $filepos, 0);                 # return file position to beginning of this seq
507            $self->{seq_id}--;                      # rewind seqid by 1
508            $self->{saved_line} = $last_header;             # restore to original seq header line
509            return 1;
510        }
511        else
512        {
513            if ($self->{saved_line} =~ /^>/)
514            {
515                $self->{seq_id}++;
516            }
517            $self->{saved_line} = <$fh>;
518        }
519    }
520    0;
521}
522
523## read_more_fasta
524## Reads remaining portion of large fasta file (size>$MaxSeqBuffer)
525## Only reads in $MaxSeqBuffer amount or less each time
526
527sub read_more_fasta
528{
529    my $self = shift;
530    my $opts = shift;
531    my $fh = $self->{FILE_H};
532
533    my ($seqlen, $filepos);
534
535    $filepos = tell($fh);
536    $seqlen = 0;
537    while (($seqlen + $self->{seq_buf_overlap} < $self->{max_seq_buffer}) && ($self->{saved_line} = <$fh>))
538    {
539        if ($self->{saved_line} =~ /^>/) { last; }
540        $self->{saved_line} =~ s/[ \n\t\d]//g;     # strip whitespace & numbers
541        $seqlen += length($self->{saved_line});
542    }
543
544    if ($seqlen == 0)
545    {
546        return 0;
547    }
548
549    seek($fh, $filepos, 0);
550
551    $self->{sequence} = $self->{buffer_overlap_seq}. 'X' x $seqlen;  # pre-extending string for efficiency
552    $seqlen = length($self->{buffer_overlap_seq});
553
554    while (($seqlen < $self->{max_seq_buffer}) && ($self->{saved_line} = <$fh>))
555    {
556        if ($self->{saved_line} =~ /^>/) { last; }
557        $self->{saved_line} =~ s/[ \n\t\d]//g;     # strip whitespace & numbers
558        substr($self->{sequence}, $seqlen, length($self->{saved_line})) = $self->{saved_line};
559        $seqlen += length($self->{saved_line});
560    }
561
562    # if sequence is longer than MaxSeqBuffer length,
563    # then save last ~200 nt to allow overlap with next buffer frame
564    # this prevents tRNAs on the border between buffers from being chopped
565    # in half (and missed!)
566
567    if ($seqlen >= $self->{max_seq_buffer})
568    {
569        $self->{buffer_overlap_seq} = substr($self->{sequence}, $seqlen - $self->{seq_buf_overlap});
570        $self->{buffer_end_index}   += $seqlen - length($self->{buffer_overlap_seq});
571        $self->{seq_buf_overrun} = 1;
572    }
573    else
574    {
575        $self->{seq_buf_overrun} = 0;
576    }
577
578    $self->{buffer_length} = length($self->{sequence});
579    $self->{sequence} = uc($self->{sequence});
580    $self->{sequence} =~ s/U/T/g;
581    $self->{sequence} =~ s/X/N/g;
582
583    ## Remove long runs of N's from consideration by pre-scanners
584    ## By doing this, pre-scanner false-pos rate is normal, even
585    ## when scanning unfinished genomes with long N insert "placeholders"
586    if ($opts->tscan_mode() or $opts->eufind_mode())
587    {
588        $self->{sequence} =~ s/NNNNNNNNNN/CCCCCCCCCC/g;
589    }
590
591    return 1;
592}
593
594sub write_fasta
595{
596    my $self = shift;
597    my $fh = $self->{FILE_H};
598
599    my ($pos, $line);
600
601    print $fh ">$self->{seq_name} $self->{seq_description}\n";
602    for ($pos = 0; $pos < length($self->{sequence}); $pos += 60)
603    {
604        $line = substr($self->{sequence}, $pos, 60);
605        print $fh $line, "\n";
606    }
607}
608
609sub get_tRNA_sequence
610{
611    my $self = shift;
612    my ($global_vars, $trna) = @_;
613    my $global_constants = $global_vars->{global_constants};
614    my $opts = $global_vars->{options};
615    my $log = $global_vars->{log_file};
616
617    $self->{seq_name} = $trna->seqname();
618    $self->{seq_description} = "";
619
620    my ($src_seq_len, $fwd_start, $query_len, $upstream, $downstream, $tRNA_seq);
621    my $src_seqid = $self->get_seq_id_from_name($trna->seqname());
622
623    my $upstream_len   = $global_constants->get("upstream_len");
624    my $downstream_len = $global_constants->get("downstream_len");
625
626    if ($trna->strand() eq "+")
627    {
628        if ($trna->start() - $upstream_len <= 0)
629        {
630            $upstream_len = $trna->start() - 1;
631        }
632        $fwd_start = $trna->start() - $upstream_len;
633        $src_seq_len = $trna->end() - $trna->start() + 1;
634    }
635    else
636    {
637        if ($trna->end() < $trna->start())
638        {
639            if ($trna->end() - $downstream_len <= 0)
640            {
641                $downstream_len = $trna->end() - 1;
642            }
643            $fwd_start = $trna->end() - $downstream_len;
644            $src_seq_len = $trna->start() - $trna->end() + 1;
645        }
646        else
647        {
648            if ($trna->start() - $downstream_len <= 0)
649            {
650                $downstream_len = $trna->start() - 1;
651            }
652            $fwd_start = $trna->start() - $downstream_len;
653            $src_seq_len = $trna->end() - $trna->start() + 1;
654        }
655    }
656    $query_len = $upstream_len + $src_seq_len + $downstream_len;
657
658    if (!$self->read_fasta_subseq($src_seqid, $fwd_start, $query_len))
659    {
660        # if can't find it on first try, reposition
661        # to beginning of file & try once more
662        $log->broadcast("Missed ".$trna->seqname()." using quick index. Rewinding seq file and trying again with slow search...");
663        $self->seekpos(0);
664        if (!$self->read_fasta_subseq_slow($opts, $trna->seqname(), $src_seqid, $fwd_start, $query_len))
665        {
666            $log->warning("Could not find ".$trna->seqname()." in ".$opts->fasta_file());
667            $log->broadcast("Skipping to next tRNA hit...");
668            return 0;
669        }
670    }
671
672    if ($trna->strand() eq "+")
673    {
674        $downstream_len = $self->{seq_length} - $upstream_len - $src_seq_len;
675        $upstream = substr($self->{sequence}, 0, $upstream_len);
676        $downstream = "";
677        if ($downstream_len > 0)
678        {
679            $downstream = substr($self->{sequence}, $upstream_len + $src_seq_len);
680        }
681        $tRNA_seq = substr($self->{sequence}, $upstream_len, $src_seq_len);
682    }
683    else
684    {
685        $upstream_len = $self->{seq_length} - $downstream_len - $src_seq_len;
686        $self->{sequence} = &rev_comp_seq($self->{sequence});
687        $upstream = "";
688        if ($upstream_len > 0)
689        {
690            $upstream = substr($self->{sequence}, 0, $upstream_len);
691        }
692        $downstream = substr($self->{sequence}, $upstream_len + $src_seq_len);
693        $tRNA_seq = substr($self->{sequence}, $upstream_len, $src_seq_len);
694    }
695
696    $trna->seq($tRNA_seq);
697    $trna->upstream($upstream);
698    $trna->downstream($downstream);
699}
700
701sub mask_out_sequence
702{
703    my $self = shift;
704    my ($seq_file, $temp_seq_file, $tRNA_hits) = @_;
705
706    my $fh_seq_in = undef;
707    my $fh_seq_out = undef;
708    my $line = "";
709    my $last_line = "";
710    my $seqname = "";
711    my %cms_hits = ();
712    my $hits = [];
713    my $ct = 0;
714    my $written_len = 0;
715    my $seq_start = 0;
716    my $subseq_start = 0;
717    my $subseq_len = 0;
718    my $N_start = 0;
719
720    for (my $i = 0; $i < $tRNA_hits->get_count(); $i++)
721    {
722        if (defined $cms_hits{$tRNA_hits->get($i)->seqname()})
723        {
724            push (@{$cms_hits{$tRNA_hits->get($i)->seqname()}}, $tRNA_hits->get($i));
725        }
726        else
727        {
728            $hits = [];
729            push (@$hits, $tRNA_hits->get($i));
730            $cms_hits{$tRNA_hits->get($i)->seqname()} = $hits;
731        }
732    }
733
734    &open_for_read(\$fh_seq_in, $seq_file);
735    &open_for_write(\$fh_seq_out, $temp_seq_file);
736
737    while ($line = <$fh_seq_in>)
738    {
739        chomp($line);
740        if ($line =~ /^>([^\t]+)$/)
741        {
742            $seqname = $1;
743            $seqname = &trim($seqname);
744            if (index($seqname, ' ') > -1)
745            {
746                $seqname = substr($seqname, 0, index($seqname, ' '));
747            }
748            $hits = undef;
749            $ct = 0;
750            if (defined $cms_hits{$seqname})
751            {
752                $hits = $cms_hits{$seqname};
753                $subseq_len = $hits->[$ct]->len();
754                $seq_start = $hits->[$ct]->start();
755            }
756            $written_len = 0;
757            $N_start = 0;
758            if ($last_line ne "")
759            {
760                print $fh_seq_out $last_line . "\n";
761                $last_line = "";
762            }
763            print $fh_seq_out $line . "\n";
764        }
765        elsif ($line =~ /^\s*$/)
766        {}
767        else
768        {
769            if ($last_line ne "")
770            {
771                $line = $last_line . $line;
772                $last_line = "";
773            }
774            if (defined $hits)
775            {
776                if ($ct < scalar(@$hits))
777                {
778                    if ($written_len + length($line) < $seq_start)
779                    {
780                        print $fh_seq_out $line . "\n";
781                        $written_len += length($line);
782                    }
783                    else
784                    {
785                        if ($N_start > 0)
786                        {
787                            $subseq_start = 1;
788                        }
789                        else
790                        {
791                            $subseq_start = $seq_start - $written_len;
792                            print $fh_seq_out substr($line, 0, $subseq_start - 1);
793                            $written_len += ($subseq_start - 1);
794                        }
795                        if (length($line) >= ($subseq_start + $subseq_len - 1))
796                        {
797                            print $fh_seq_out 'N' x $subseq_len;
798                            print $fh_seq_out "\n";
799                            $written_len += $subseq_len;
800                            if (length($line) > ($subseq_start + $subseq_len - 1))
801                            {
802                                $last_line = substr($line, $subseq_start + $subseq_len - 1);
803                            }
804                            $N_start = 0;
805                            $ct++;
806                            if ($ct < scalar(@$hits))
807                            {
808                                $subseq_len = $hits->[$ct]->len();
809                                $seq_start = $hits->[$ct]->start();
810                            }
811                        }
812                        else
813                        {
814                            print $fh_seq_out 'N' x (length($line) - $subseq_start + 1);
815                            print $fh_seq_out "\n";
816                            $written_len += (length($line) - $subseq_start + 1);
817                            $N_start = 1;
818                            $subseq_len -= (length($line) - $subseq_start + 1);
819                        }
820                    }
821                }
822                else
823                {
824                    print $fh_seq_out $line . "\n";
825                    $written_len += length($line);
826                }
827            }
828            else
829            {
830                print $fh_seq_out $line . "\n";
831                $written_len += length($line);
832            }
833        }
834    }
835
836    close($fh_seq_in);
837    close($fh_seq_out);
838}
839
8401;
841