1############################################################################
2# demotic_fasta package
3#    Parses fasta or ssearch output, stores extracted information in convenient vars.
4#    SRE, Wed Jun 25 13:41:41 2003
5#
6############################################################################
7
8package demotic_fasta;
9
10# parse(\*STDIN) would parse ssearch output
11# coming in via stdin.
12#
13sub parse (*) {
14    my $fh = shift;
15    my $parsing_header  = 1;
16    my $parsing_hitlist = 0;
17    my $parsing_alilist = 0;
18    my $target;
19    my $alilinecount = 0;
20    my $prvaliline_isquery = 0;
21    my $ali_qline;
22    my $ali_tline;
23    my $ali_qasq;
24    my $ali_tasq;
25    my $margin;
26
27    # Initialize everything... so we can call the parser
28    # repeatedly, one call per ssearch output.
29    #
30    # This section also documents what's available in
31    # variables within the package.
32    #
33    # Arrays are indexed [0..nhits-1] or [0..nali-1]
34    #
35    $queryname      = "";	# Name of the query sequence
36    $querydesc      = "";	# Description line for the query (or "")
37    $querylen       = 0;	# Length of the query in residues
38    $db             = "";	# Name of the target database file
39    $db_nseq        = 0;	# Number of sequences in the target database
40    $db_nletters    = "";	# Number of residues in the target database
41                                # (stored as a string so we can have large #'s)
42
43				# The top hit list (still in rank order)
44    $nhits          = 0;	# Number of entries in the hit list
45    @hit_target     = ();	# Target sequence name (by rank)
46    %target_desc    = ();	# Target sequence description (by targ name)
47    %target_len     = ();	# Length of target sequence
48    @hit_score      = ();	# Raw score (by rank)
49    @hit_bitscore   = ();	# Bit score (by rank)
50    @hit_Eval       = ();	# E-value (by rank)
51
52				# The alignment output (in order it came in)
53				# all indexed by # of alignment [0..nali-1]
54    $nali           = 0;	# Number of alignments
55    @ali_target     = ();	# Target sequence name
56    @ali_score      = ();	# Smith/Waterman raw score of alignment
57    @ali_bitscore   = ();	# bit score
58    @ali_evalue     = ();	# E-value
59    @ali_nident     = ();	# Number of identical residues
60    @ali_alen       = ();	# Length of alignment (overlap)
61    @ali_identity   = ();	# Percent identity
62#    @ali_npos       = (); # Number of positives (similar positions)
63#    @ali_positive   = (); # Percent of positions matched or similar
64    @ali_qstart     = ();	# Start position on query
65    @ali_qend       = ();	# End position on query
66    @ali_tstart     = ();	# Start position on target
67    @ali_tend       = ();	# End position on target
68    @ali_qali       = (); # Aligned string from query
69    @ali_tali       = (); # Aligned string from target (subject)
70    @ali_qmask      = (); # line in between the two aligned strings
71    @ali_tmask      = (); # line in between the two aligned strings
72    @ali_hitidx     = ();       # index of hit
73    $hitidx = -1;
74
75    if (defined($save_querycount) && $save_querycount > 1) { # on subsequent queries, we had to use the >>> start line to detect
76	# the end of the prev query; we socked the necessary info away in tmp vars.
77	$querycount = $save_querycount;
78	$queryname  = $save_queryname;
79	$querydesc  = $save_querydesc;
80	$querylen   = $save_querylen;
81    }
82
83    # Now, loop over the lines of our input, and parse 'em.
84    #
85    my $line;
86    my $prvline;
87    while (<$fh>) {
88	$line = $_;
89	if ($parsing_header) {
90	    if (/^The best scores are:/) { # start of hit list
91		$parsing_header  = 0;
92		$parsing_hitlist = 1;
93		next;
94	    } elsif (/^!! No sequences/) { # no hit list: no hits; just return
95		return 1;	# return "success"
96	    } elsif (/^\s+(\d+)>>>\s*(\S*)\s*(.*)\s*-\s*(\d+) \S\S$/) { # allows blank query. \S\S is "nt" or "aa"
97		$querycount = $1;
98		$queryname  = $2;
99		$querydesc  = $3;
100		$querylen   = $4;
101		if ($queryname eq "") {
102		    $queryname = "unnamed_query";
103		}
104	    } elsif (/^\s+(\d+)\s+residues in\s+(\d+)\s+sequences\s*$/) {
105		$db_nletters = $1;
106		$db_nseq     = $2;
107	    }
108	}
109	elsif ($parsing_hitlist) {
110	    if (/^\s*$/) {	# blank line marks end of hit list, start of alignments
111		$parsing_hitlist = 0;
112		$parsing_alilist = 1;
113		next;
114	    } elsif (/^(\S+)\s*(.*\S?)\s*\(\s*(\d+)\)\s+(\d+)\s+(\S+)\s+(\S+)\s*$/) {
115		$hit_target[$nhits]    = $1;
116		$target_desc{$1}       = $2;
117	        $target_len{$1}        = $3;
118		$hit_score[$nhits]     = $4;
119		$hit_bitscore[$nhits]  = $5;
120		$hit_Eval[$nhits]      = $6;
121		$nhits++;
122	    }
123	}
124	elsif ($parsing_alilist) {
125	    if (/^>>(\S+)\s*(.*)\s+\((\d+) \S\S\)\s*$/) {  # the \S\S is either nt or aa
126		$target = $1;
127		$hitidx ++;
128		$target_desc{$target} = $2;
129		if ($3 != $target_len{$target}) { die "can't happen.", "1)", $3, "2)", $target_len{$target}; }
130	    }
131	    elsif (/^ s-w opt:\s+(\d+)\s+Z-score:\s*(\S+)\s+bits:\s+(\S+)\s+E\(\d*\):\s+(\S+)\s*$/) {  # SSEARCH
132		$nali++;
133		$ali_target[$nali-1]   = $target;
134		$ali_score[$nali-1]    = $1;
135		$ali_bitscore[$nali-1] = $3;
136		$ali_evalue[$nali-1]   = $4;
137		$ali_hitidx[$nali-1]   = $hitidx;
138	    }
139	    elsif (/^ initn:\s*\d+\s*init1:\s*\d+\s*opt:\s*(\d+)\s*Z-score:\s*(\S+)\s*bits:\s*(\S+)\s*E\(\d*\):\s*(\S+)\s*$/) { # FASTA
140		$nali++;
141		$ali_target[$nali-1]   = $target;
142		$ali_score[$nali-1]    = $1;
143		$ali_bitscore[$nali-1] = $3;
144		$ali_evalue[$nali-1]   = $4;
145		$ali_hitidx[$nali-1]   = $hitidx;
146	    }
147	    elsif (/^Smith-Waterman score:\s+(\d+);\s+(\S+)% identity .* in (\d+) \S\S overlap \((\d+)-(\d+):(\d+)-(\d+)\)\s*/) {
148		$ali_identity[$nali-1]   = $2;
149		$ali_alen[$nali-1]       = $3;
150		$ali_qstart[$nali-1]     = $4;
151		$ali_qend[$nali-1]       = $5;
152		$ali_tstart[$nali-1]     = $6;
153		$ali_tend[$nali-1]       = $7;
154#		$ali_nident[$nali-1]     = $1;
155#		$ali_npos[$nali-1]       = $4;
156#		$ali_positive[$nali-1]   = $5;
157		$alilinecount            = 0;
158		$ali_qali[$nali-1]       = "";
159		$ali_tali[$nali-1]       = "";
160		$ali_qmask[$nali-1]      = "";
161		$ali_tmask[$nali-1]      = "";
162	    }
163	    elsif (/^(\S+)\s+(\S+)\s*$/) { # only ali lines are right-flushed
164		# the usual alignment display is
165		#       ali_query_line
166		#       mask
167		#       ali_target_line
168		#
169		# (1) ends are not flushed, and they can have "extra stuff"
170		#       function calculate_flushedmask() corrects that
171		#
172		# (2) alingments do not need to be complete. (particularly using option -a )
173		#     Meaning at the end AND at the beginning as well, you can end up with one single query line
174		#     or one single target line.
175		#
176		#     ali_query_line
177		#        (no mask)
178		#        (no ali_target_line)
179		#
180		# or
181		#
182		#         (no ali_query_line)
183		#         (no mask)
184		#     ali_target_line
185		#
186		# or even
187		#
188		#     aliq_query_line
189		#        (no mask)
190		#     ali_target_Line
191		#
192		# why? oh! why? check for that and fix it.
193
194		my $name = $1;
195		my $asq  = $2;
196
197		#carefull, the alignment part, truncates the names of the query and targets
198		# this is a problem specially if the query and target names start similarly.
199		# it appears that querynames have been truncated to 5 characters and targetnames to 6
200		# also check for a prvline with numbers, but if len < 10 those do not show up either
201		if ($queryname =~ /^$name/ && (length($name) <= 5 || $prvline =~ /\s+(\d+)\s*/)) {
202		    $prvaliline_isquery = 1;
203		    $ali_qline = $_; $ali_qline =~ s/\n//;
204		    $ali_qasq = $asq;
205		    $mask = "";
206		}
207		elsif ($ali_target[$nali-1] =~ /^$name/) {
208		    $talilinecount ++;
209		    $ali_tline = $_; $ali_tline =~ s/\n//;
210		    $ali_tasq = $asq;
211		    if ($prvaliline_isquery) {
212			$ali_qali[$nali-1]  .= $ali_qasq;
213			$ali_tali[$nali-1]  .= $ali_tasq;
214			$ali_qmask[$nali-1] .= calculate_flushedmask($ali_qline, $mask);
215			$ali_tmask[$nali-1] .= calculate_flushedmask($ali_tline, $mask);
216		    }
217		    $prvaliline_isquery = 0;
218		}
219		$alilinecount++;
220	    }
221	    elsif (/^(\s*[\.\:][\s\.\:]*)$/) {
222		$mask .= $1;
223	    }
224
225	    elsif (/^\s+(\d+)>>>\s*(\S*)\s*(.*)\s*-\s*(\d+) \S\S$/) { # next query is starting. \S\S is "nt" or "aa"
226		$save_querycount = $1;
227		$save_queryname  = $2;
228		$save_querydesc  = $3;
229		$save_querylen   = $4;
230		if ($save_queryname eq "") { $save_queryname = "unnamed_query"; }
231		return 1;	# normal return. We've finished output for a query, and stored some info about the next one.
232	    }
233	}
234	$prvline = $line;
235    } # this closes the loop over lines in the input stream: at EOF.
236
237    if ($parsing_alilist) {
238	for (my $ali = 0; $ali < $nali; $ali ++) {
239	    # the ali lines come along with more residues that are not part of the alignment. Why? oh! why?. REMOVE
240	    # you cannot remove using ali_{q,t}start and $ali_{q,t}end because those are
241	    # relative to the full sequence. Need to do it by parsing also the line in between (what I call the "mask")
242	    # and finding the first and last ":" or "."
243	    $ali_qali[$ali] = ali_removeflanking($ali_qali[$ali], $ali_qmask[$ali], $ali_qstart[$ali], $ali_qend[$ali]);
244	    $ali_tali[$ali] = ali_removeflanking($ali_tali[$ali], $ali_tmask[$ali], $ali_tstart[$ali], $ali_tend[$ali]);
245	}
246	return 1;
247    }
248    else { return 0; }  # at EOF: normal return if we were in the alignment section.
249}
250
251
252
253sub exblxout {
254    my $ofh     = shift;
255    my $i;
256
257    for ($i = 0; $i <= $nali-1; $i++) {
258	printf $ofh "%s\t%d\t%d\t%d\t%s\t%d\t%d\t%s\n",
259	$ali_evalue[$i],
260	$ali_identity[$i],
261	$ali_tstart[$i],
262	$ali_tend[$i],
263	$ali_target[$i],
264	$ali_qstart[$i],
265	$ali_qend[$i],
266	$queryname;
267    }
268}
269
270sub tblout {
271    my $ofh     = shift;
272    my $i;
273
274    for ($i = 0; $i <= $nali-1; $i++) {
275	printf $ofh "%s\t%d\t%d\t%d\t%s\t%d\t%d\t%s\t%s\n",
276	$ali_evalue[$i],
277	$ali_identity[$i],
278	$ali_qstart[$i],
279	$ali_qend[$i],
280	$queryname,
281	$ali_tstart[$i],
282	$ali_tend[$i],
283	$ali_target[$i],
284	$target_desc{$ali_target[$i]};
285    }
286}
287
288
289sub gffout {
290    my $ofh     = shift;
291    my $source  = shift;
292    my $feature = shift;
293    my $i;
294    my $strand;
295
296    for ($i = 0; $i <= $nali-1; $i++) {
297	if ($ali_qstart[$i] > $ali_qend[$i]) { $strand = "-"; }
298	else { $strand = "+"; }
299
300	printf $ofh "%s\t%s\t%s\t%d\t%d\t%.1f\t%s\t.\tgene \"%s\"\n",
301	$ali_target[$i],
302	$source,
303	$feature,
304	$ali_tstart[$i],
305	$ali_tend[$i],
306	$ali_bitscore[$i],
307	$strand,
308	$queryname;
309    }
310}
311
312sub profmarkout {
313    my $ofh = shift;
314    my $i;
315
316    for ($i = 0; $i < $nhits; $i++) {
317	printf $ofh "%g\t%.1f\t%s\t%s\n", $hit_Eval[$i], $hit_bitscore[$i], $hit_target[$i], $queryname;
318    }
319}
320
321sub calculate_flushedmask {
322    my ($asq, $mask) = @_;
323
324    my $lremove = -1;
325    my $flushedasq;
326
327    if ($asq =~ /^(\S+\s+)(\S+)\s*$/) {
328	$lremove = length($1);
329	$flushedasq = $2;
330    }
331
332    my $flushedmask = $mask;
333    $flushedmask =~ s/^(.{$lremove})//;
334    $flushedmask =~ s/\n//;
335
336    if (length($flushedmask) > length($flushedasq)) {
337	while (length($flushedmask) > length($flushedasq)) {
338	    if ($flushedmask =~ /(\s)$/) { $flushedmask =~ s/(\s)$//; }
339	}
340    }
341    if (length($flushedmask) < length($flushedasq)) {
342	while (length($flushedmask) < length($flushedasq)) { $flushedmask .= " "; }
343    }
344
345    #printf("\naseq|$asq|\nmask|$mask|\n");
346    #printf("^^aseq|$flushedasq|\n^^mask|$flushedmask|\n");
347
348    return $flushedmask;
349}
350
351
352sub  ali_removeflanking {
353    my ($aseq, $mask) = @_;
354
355    my $taseq = "";
356
357    my $alen = length($aseq);
358
359    my $apos_start = 0;
360    while ($apos_start < $alen) {
361	my $mval = substr($mask, $apos_start, 1);
362	if ($mval  =~ /[\.\:]/) { last; }
363	$apos_start ++;
364    }
365
366    my $apos_end = $alen-1;
367    while ($apos_end >= 0) {
368	my $mval = substr($mask, $apos_end, 1);
369	if ($mval  =~ /[\.\:]/) { last; }
370	$apos_end --;
371    }
372
373    for (my $apos = $apos_start; $apos <= $apos_end; $apos++) {
374	$taseq .= substr($aseq, $apos, 1);
375    }
376    #print "B:$aseq\nM:$mask\nA:$taseq\n";
377
378    return $taseq;
379}
380
381
3821;
383__END__
384