1# -*-Perl-*- Test Harness script for Bioperl
2# $Id: SearchIO_blast.t 14995 2008-11-16 06:20:00Z cjfields $
3
4use strict;
5use warnings;
6
7BEGIN {
8    use Bio::Root::Test;
9
10    test_begin(-tests => 1389);
11
12    use_ok('Bio::SearchIO');
13}
14
15SKIP: {
16    test_skip(-tests => 4, -requires_module => 'Path::Class');
17
18
19    my $file = Path::Class::file(test_input_file('ecolitst.bls'));
20    my $f    = sub { my ($file) = @_; Bio::SearchIO->new( -file   => $file, -format => 'blast') };
21
22    lives_ok(sub { $f->($file) } , 'Bio::SearchIO->new can handle a Path::Class object');
23    isa_ok($f->($file), 'Bio::Root::IO');
24
25    $file = Path::Class::dir(File::Spec->catfile(qw/t data/))->file('ecolitst.bls');
26
27    lives_ok(sub { $f->($file) } , 'Bio::SearchIO->new can handle a Path::Class object');
28    isa_ok($f->($file), 'Bio::Root::IO');
29}
30
31my ( $searchio, $result, $iter, $hit, $hsp );
32
33$searchio = Bio::SearchIO->new(
34    '-format' => 'blast',
35    '-file'   => test_input_file('ecolitst.bls')
36);
37
38$result = $searchio->next_result;
39
40like($result->algorithm_reference,
41    qr/Gapped BLAST and PSI-BLAST: a new generation of protein database search/
42);
43
44is( $result->database_name, 'ecoli.aa', 'database_name()' );
45is( $result->database_entries, 4289 );
46is( $result->database_letters, 1358990 );
47
48is( $result->algorithm, 'BLASTP' );
49like( $result->algorithm_version, qr/^2\.1\.3/ );
50like( $result->query_name,
51qr/gi|1786183|gb|AAC73113.1| (AE000111) aspartokinase I,\s+homoserine dehydrogenase I [Escherichia coli]/
52);
53is( $result->query_accession,                 'AAC73113.1' );
54is( $result->query_gi,                        1786183 );
55is( $result->query_length,                    820 );
56is( $result->get_statistic('kappa'),          '0.135' );
57is( $result->get_statistic('kappa_gapped'),   '0.0410' );
58is( $result->get_statistic('lambda'),         '0.319' );
59is( $result->get_statistic('lambda_gapped'),  '0.267' );
60is( $result->get_statistic('entropy'),        '0.383' );
61is( $result->get_statistic('entropy_gapped'), '0.140' );
62
63is( $result->get_statistic('dbletters'),           1358990 );
64is( $result->get_statistic('dbentries'),           4289 );
65is( $result->get_statistic('effective_hsplength'), 47 );
66is( $result->get_statistic('effectivespace'),      894675611 );
67is( $result->get_parameter('matrix'),              'BLOSUM62' );
68is( $result->get_parameter('gapopen'),             11 );
69is( $result->get_parameter('gapext'),              1 );
70is( $result->get_statistic('S2'),                  '92' );
71is( $result->get_statistic('S2_bits'),             '40.0' );
72float_is( $result->get_parameter('expect'), '1.0e-03' );
73is( $result->get_statistic('num_extensions'),     '82424' );
74is( $result->get_statistic('querylength'),        773 );
75is( $result->get_statistic('effectivedblength'),  1157407 );
76is( $result->get_statistic('effectivespaceused'), 894675611 );
77
78my @valid = (
79    [ 'gb|AAC73113.1|', 820, 'AAC73113', '0',     1567, 4058 ],
80    [ 'gb|AAC76922.1|', 810, 'AAC76922', '1e-91', 332,  850 ],
81    [ 'gb|AAC76994.1|', 449, 'AAC76994', '3e-47', 184,  467 ]
82);
83my $count = 0;
84while ( $hit = $result->next_hit ) {
85    my $d = shift @valid;
86
87    is( $hit->name,      shift @$d );
88    is( $hit->length,    shift @$d );
89    is( $hit->accession, shift @$d );
90    float_is( $hit->significance, shift @$d );
91    is( $hit->bits,      shift @$d );
92    is( $hit->raw_score, shift @$d );
93
94    if ( $count == 0 ) {
95        my $hsps_left = 1;
96        while ( my $hsp = $hit->next_hsp ) {
97            is( $hsp->query->start,    1 );
98            is( $hsp->query->end,      820 );
99            is( $hsp->hit->start,      1 );
100            is( $hsp->hit->end,        820 );
101            is( $hsp->length('total'), 820 );
102            is( $hsp->start('hit'),    $hsp->hit->start );
103            is( $hsp->end('query'),    $hsp->query->end );
104            is( $hsp->strand('sbjct'), $hsp->subject->strand );  # alias for hit
105            float_is( $hsp->evalue, 0.0 );
106            is( $hsp->score, 4058 );
107            is( $hsp->bits,  1567 );
108            is( sprintf( "%.2f", $hsp->percent_identity ),        98.29 );
109            is( sprintf( "%.4f", $hsp->frac_identical('query') ), 0.9829 );
110            is( sprintf( "%.4f", $hsp->frac_identical('hit') ),   0.9829 );
111            is( $hsp->gaps, 0 );
112            is( $hsp->n,    1 );
113            $hsps_left--;
114        }
115        is( $hsps_left, 0 );
116    }
117    last if ( $count++ > @valid );
118}
119is( @valid, 0 );
120
121$searchio = Bio::SearchIO->new(
122    '-format' => 'blast',
123    '-file'   => test_input_file('ecolitst.wublastp')
124);
125
126$result = $searchio->next_result;
127
128like($result->algorithm_reference,
129     qr/Gish, W. \(1996-2000\)/);
130
131is( $result->database_name,    'ecoli.aa' );
132is( $result->database_letters, 1358990 );
133is( $result->database_entries, 4289 );
134is( $result->algorithm,        'BLASTP' );
135like( $result->algorithm_version, qr/^2\.0MP\-WashU/ );
136like( $result->query_name,
137qr/gi|1786183|gb|AAC73113.1| (AE000111) aspartokinase I,\s+homoserine dehydrogenase I [Escherichia coli]/
138);
139is( $result->query_accession, 'AAC73113.1' );
140
141is( $result->query_length,                          820 );
142is( $result->query_gi,                              1786183 );
143is( $result->get_statistic('kappa'),                0.136 );
144is( $result->get_statistic('lambda'),               0.319 );
145is( $result->get_statistic('entropy'),              0.384 );
146is( $result->get_statistic('dbletters'),            1358990 );
147is( $result->get_statistic('dbentries'),            4289 );
148is( $result->get_parameter('matrix'),               'BLOSUM62' );
149is( $result->get_statistic('Frame+0_lambda_used'),  '0.319' );
150is( $result->get_statistic('Frame+0_kappa_used'),   '0.136' );
151is( $result->get_statistic('Frame+0_entropy_used'), '0.384' );
152
153is( $result->get_statistic('Frame+0_lambda_computed'),  '0.319' );
154is( $result->get_statistic('Frame+0_kappa_computed'),   '0.136' );
155is( $result->get_statistic('Frame+0_entropy_computed'), '0.384' );
156
157is( $result->get_statistic('Frame+0_lambda_gapped'),  '0.244' );
158is( $result->get_statistic('Frame+0_kappa_gapped'),   '0.0300' );
159is( $result->get_statistic('Frame+0_entropy_gapped'), '0.180' );
160
161@valid = (
162    [ 'gb|AAC73113.1|', 820, 'AAC73113', '0',       4141 ],
163    [ 'gb|AAC76922.1|', 810, 'AAC76922', '3.1e-86', 844 ],
164    [ 'gb|AAC76994.1|', 449, 'AAC76994', '2.8e-47', 483 ]
165);
166$count = 0;
167while ( $hit = $result->next_hit ) {
168    my $d = shift @valid;
169
170    if ( $count == 1 ) {
171
172        # Test HSP contig data returned by SearchUtils::tile_hsps()
173        # Second hit has two hsps that overlap.
174
175        # compare with the contig made by hand for these two contigs
176        # in t/data/contig-by-hand.wublastp
177        # (in this made-up file, the hsps from ecolitst.wublastp
178        #  were aligned and contiged, and Length, Identities, Positives
179        #  were counted, by a human (maj) )
180
181        my $hand_hit = Bio::SearchIO->new(
182            -format => 'blast',
183            -file   => test_input_file('contig-by-hand.wublastp')
184        )->next_result->next_hit;
185        my $hand_hsp     = $hand_hit->next_hsp;
186        my @hand_qrng    = $hand_hsp->range('query');
187        my @hand_srng    = $hand_hsp->range('hit');
188        my @hand_matches = $hand_hit->matches;
189
190        my ( $qcontigs, $scontigs ) = Bio::Search::SearchUtils::tile_hsps($hit);
191
192        # Query contigs
193        is( $qcontigs->[0]->{'start'}, $hand_qrng[0] );
194        is( $qcontigs->[0]->{'stop'},  $hand_qrng[1] );
195        is( $qcontigs->[0]->{'iden'},  $hand_matches[0] );
196        is( $qcontigs->[0]->{'cons'},  $hand_matches[1] );
197
198        # Subject contigs
199        is( $scontigs->[0]->{'start'}, $hand_srng[0] );
200        is( $scontigs->[0]->{'stop'},  $hand_srng[1] );
201        is( $scontigs->[0]->{'iden'},  $hand_matches[0] );
202        is( $scontigs->[0]->{'cons'},  $hand_matches[1] );
203    }
204
205    is( $hit->name,      shift @$d );
206    is( $hit->length,    shift @$d );
207    is( $hit->accession, shift @$d );
208    float_is( $hit->significance, shift @$d );
209    is( $hit->raw_score, shift @$d );
210
211    if ( $count == 0 ) {
212        my $hsps_left = 1;
213        while ( my $hsp = $hit->next_hsp ) {
214            is( $hsp->query->start,    1 );
215            is( $hsp->query->end,      820 );
216            is( $hsp->hit->start,      1 );
217            is( $hsp->hit->end,        820 );
218            is( $hsp->length('total'), 820 );
219
220            float_is( $hsp->evalue, 0.0 );
221            float_is( $hsp->pvalue, '0.0' );
222            is( $hsp->score,                   4141 );
223            is( $hsp->bits,                    1462.8 );
224            is( $hsp->percent_identity,        100 );
225            is( $hsp->frac_identical('query'), 1.00 );
226            is( $hsp->frac_identical('hit'),   1.00 );
227            is( $hsp->gaps,                    0 );
228            is( $hsp->n,                       1 );
229            $hsps_left--;
230        }
231        is( $hsps_left, 0 );
232    }
233    last if ( $count++ > @valid );
234}
235is( @valid, 0 );
236
237# test that add hit really works properly for BLAST objects
238# bug 1611
239my @hits = $result->hits;
240$result->add_hit( $hits[0] );
241is( $result->num_hits, @hits + 1 );
242
243# test WU-BLAST -noseqs option
244$searchio = Bio::SearchIO->new(
245    '-format' => 'blast',
246    '-file'   => test_input_file('ecolitst.noseqs.wublastp')
247);
248
249$result = $searchio->next_result;
250is(
251    $result->algorithm_reference, 'Gish, W. (1996-2004) http://blast.wustl.edu
252'
253);
254is( $result->database_name,    'ecoli.aa' );
255is( $result->database_letters, 1358990 );
256is( $result->database_entries, 4289 );
257is( $result->algorithm,        'BLASTP' );
258like( $result->algorithm_version, qr/^2\.0MP\-WashU/ );
259like( $result->query_name,
260qr/gi|1786183|gb|AAC73113.1| (AE000111) aspartokinase I,\s+homoserine dehydrogenase I [Escherichia coli]/
261);
262is( $result->query_accession, 'AAC73113.1' );
263is( $result->query_gi,        1786183 );
264
265is( $result->query_length,                          820 );
266is( $result->get_statistic('kappa'),                0.135 );
267is( $result->get_statistic('lambda'),               0.319 );
268is( $result->get_statistic('entropy'),              0.384 );
269is( $result->get_statistic('dbletters'),            1358990 );
270is( $result->get_statistic('dbentries'),            4289 );
271is( $result->get_parameter('matrix'),               'BLOSUM62' );
272is( $result->get_statistic('Frame+0_lambda_used'),  '0.319' );
273is( $result->get_statistic('Frame+0_kappa_used'),   '0.135' );
274is( $result->get_statistic('Frame+0_entropy_used'), '0.384' );
275
276is( $result->get_statistic('Frame+0_lambda_computed'),  '0.319' );
277is( $result->get_statistic('Frame+0_kappa_computed'),   '0.135' );
278is( $result->get_statistic('Frame+0_entropy_computed'), '0.384' );
279
280is( $result->get_statistic('Frame+0_lambda_gapped'),  '0.244' );
281is( $result->get_statistic('Frame+0_kappa_gapped'),   '0.0300' );
282is( $result->get_statistic('Frame+0_entropy_gapped'), '0.180' );
283
284@valid = (
285    [ 'gb|AAC73113.1|', 820, 'AAC73113', '0',       4141 ],
286    [ 'gb|AAC76922.1|', 810, 'AAC76922', '6.6e-93', 907 ],
287    [ 'gb|AAC76994.1|', 449, 'AAC76994', '2.8e-47', 483 ]
288);
289$count = 0;
290while ( $hit = $result->next_hit ) {
291    my $d = shift @valid;
292
293    is( $hit->name,      shift @$d );
294    is( $hit->length,    shift @$d );
295    is( $hit->accession, shift @$d );
296    float_is( $hit->significance, shift @$d );
297    is( $hit->raw_score, shift @$d );
298
299    if ( $count == 0 ) {
300        my $hsps_left = 1;
301        while ( my $hsp = $hit->next_hsp ) {
302            is( $hsp->query->start,    1 );
303            is( $hsp->query->end,      820 );
304            is( $hsp->hit->start,      1 );
305            is( $hsp->hit->end,        820 );
306            is( $hsp->length('total'), 820 );
307
308            float_is( $hsp->evalue, 0. );
309            float_is( $hsp->pvalue, '0.' );
310            is( $hsp->score,                   4141 );
311            is( $hsp->bits,                    1462.8 );
312            is( $hsp->percent_identity,        100 );
313            is( $hsp->frac_identical('query'), 1.00 );
314            is( $hsp->frac_identical('hit'),   1.00 );
315            is( $hsp->gaps,                    0 );
316            is( $hsp->n,                       1 );
317            $hsps_left--;
318        }
319        is( $hsps_left, 0 );
320    }
321    last if ( $count++ > @valid );
322}
323is( @valid, 0 );
324
325# test tblastx
326$searchio = Bio::SearchIO->new(
327    '-format' => 'blast',
328    '-file'   => test_input_file('HUMBETGLOA.tblastx')
329);
330
331$result = $searchio->next_result;
332like($result->algorithm_reference,qr/Gapped BLAST and PSI-BLAST/);
333is( $result->database_name,    'ecoli.nt' );
334is( $result->database_letters, 4662239 );
335is( $result->database_entries, 400 );
336is( $result->algorithm,        'TBLASTX' );
337like( $result->algorithm_version, qr/^2\.1\.2/ );
338is( $result->query_name, 'HUMBETGLOA' );
339is( $result->query_description,
340    'Human haplotype C4 beta-globin gene, complete cds.' );
341is( $result->query_length,                        3002 );
342is( $result->get_statistic('kappa'),              0.135 );
343is( $result->get_statistic('lambda'),             0.318 );
344is( $result->get_statistic('entropy'),            0.401 );
345is( $result->get_statistic('dbletters'),          4662239 );
346is( $result->get_statistic('dbentries'),          400 );
347is( $result->get_statistic('querylength'),        953 );
348is( $result->get_statistic('effectivedblength'),  1535279 );
349is( $result->get_statistic('effectivespace'),     1463120887 );
350is( $result->get_statistic('effectivespaceused'), 1463120887 );
351is( $result->get_statistic('T'),                  13 );
352is( $result->get_statistic('X1'),                 16 );
353is( $result->get_statistic('X1_bits'),            7.3 );
354is( $result->get_statistic('X2'),                 0 );
355is( $result->get_statistic('X2_bits'),            '0.0' );
356is( $result->get_statistic('S1'),                 41 );
357is( $result->get_statistic('S1_bits'),            21.7 );
358is( $result->get_statistic('S2'),                 53 );
359is( $result->get_statistic('S2_bits'),            27.2 );
360
361is( $result->get_statistic('decayconst'), 0.1 );
362
363is( $result->get_parameter('matrix'), 'BLOSUM62' );
364
365@valid = (
366    [ 'gb|AE000479.1|AE000479', 10934, 'AE000479', '0.13', 33.6, 67 ],
367    [ 'gb|AE000302.1|AE000302', 10264, 'AE000302', '0.61', 31.3, 62 ],
368    [ 'gb|AE000277.1|AE000277', 11653, 'AE000277', '0.84', 30.8, 61 ]
369);
370$count = 0;
371
372while ( $hit = $result->next_hit ) {
373    my $d = shift @valid;
374    is( $hit->name,      shift @$d );
375    is( $hit->length,    shift @$d );
376    is( $hit->accession, shift @$d );
377    float_is( $hit->significance, shift @$d );
378    is( $hit->bits,      shift @$d );
379    is( $hit->raw_score, shift @$d );
380
381    if ( $count == 0 ) {
382        my $hsps_left = 1;
383        while ( my $hsp = $hit->next_hsp ) {
384            is( $hsp->query->start,    1057 );
385            is( $hsp->query->end,      1134 );
386            is( $hsp->query->strand,   1 );
387            is( $hsp->strand('query'), $hsp->query->strand );
388            is( $hsp->hit->end,        5893 );
389            is( $hsp->hit->start,      5816 );
390            is( $hsp->hit->strand,     -1 );
391            is( $hsp->strand('sbjct'), $hsp->subject->strand );
392            is( $hsp->length('total'), 26 );
393
394            float_is( $hsp->evalue, 0.13 );
395            is( $hsp->score, 67 );
396            is( $hsp->bits,  33.6 );
397            is( sprintf( "%.2f", $hsp->percent_identity ),        42.31 );
398            is( sprintf( "%.4f", $hsp->frac_identical('query') ), '0.4231' );
399            is( sprintf( "%.4f", $hsp->frac_identical('hit') ),   '0.4231' );
400            is( $hsp->query->frame(),  0 );
401            is( $hsp->hit->frame(),    1 );
402            is( $hsp->gaps,            0 );
403            is( $hsp->query_string,    'SAYWSIFPPLGCWWSTLGPRGSLSPL' );
404            is( $hsp->hit_string,      'AAVWALFPPVGSQWGCLASQWRTSPL' );
405            is( $hsp->homology_string, '+A W++FPP+G  W  L  +   SPL' );
406
407            # changed to reflect positional ambiguities, note extra flag
408            is(
409                join( ' ', $hsp->seq_inds( 'query', 'nomatch', 1 ) ),
410                '1063-1065 1090-1095 1099-1104 1108-1113 1117-1125'
411            );
412            is(
413                join( ' ', $hsp->seq_inds( 'hit', 'nomatch', 1 ) ),
414                '5825-5833 5837-5842 5846-5851 5855-5860 5885-5887'
415            );
416            is( $hsp->ambiguous_seq_inds, 'query/subject' );
417            is( $hsp->n,                  1 );
418            $hsps_left--;
419        }
420        is( $hsps_left, 0 );
421    }
422    last if ( $count++ > @valid );
423}
424is( @valid, 0 );
425
426# test for MarkW bug in blastN
427
428$searchio = Bio::SearchIO->new(
429    '-format' => 'blast',
430    '-file'   => test_input_file('a_thaliana.blastn')
431);
432
433$result = $searchio->next_result;
434like($result->algorithm_reference,qr/Gapped BLAST and PSI-BLAST/);
435is( $result->rid, '1012577175-3730-28291' );
436is( $result->database_name,
437'All GenBank+EMBL+DDBJ+PDB sequences (but no EST, STS, GSS,or phase 0, 1 or 2 HTGS sequences) '
438);
439is( $result->database_letters, 4677375331 );
440is( $result->database_entries, 1083200 );
441is( $result->algorithm,        'BLASTN' );
442like( $result->algorithm_version, qr/^2\.2\.1/ );
443is( $result->query_name,                          '' );
444is( $result->query_length,                        60 );
445is( $result->get_parameter('gapopen'),            5 );
446is( $result->get_parameter('gapext'),             2 );
447is( $result->get_parameter('ktup'),               undef );
448is( $result->get_statistic('querylength'),        41 );
449is( $result->get_statistic('effectivedblength'),  4656794531 );
450is( $result->get_statistic('effectivespace'),     190928575771 );
451is( $result->get_statistic('effectivespaceused'), 190928575771 );
452
453is( $result->get_statistic('lambda'),  1.37 );
454is( $result->get_statistic('kappa'),   0.711 );
455is( $result->get_statistic('entropy'), 1.31 );
456is( $result->get_statistic('T'),       0 );
457is( $result->get_statistic('A'),       30 );
458is( $result->get_statistic('X1'),      '6' );
459is( $result->get_statistic('X1_bits'), 11.9 );
460is( $result->get_statistic('X2'),      15 );
461is( $result->get_statistic('X2_bits'), 29.7 );
462is( $result->get_statistic('S1'),      12 );
463is( $result->get_statistic('S1_bits'), 24.3 );
464is( $result->get_statistic('S2'),      17 );
465is( $result->get_statistic('S2_bits'), 34.2 );
466
467is( $result->get_statistic('dbentries'), 1083200 );
468
469@valid = (
470    [ 'gb|AY052359.1|', 2826, 'AY052359', '3e-18', 95.6, 48, 1, 60, '1.0000' ],
471    [
472        'gb|AC002329.2|AC002329', 76170, 'AC002329', '3e-18', 95.6, 48, 1, 60,
473        '1.0000'
474    ],
475    [
476        'gb|AF132318.1|AF132318', 5383, 'AF132318', '0.04', 42.1, 21, 35, 55,
477        '0.3500'
478    ]
479);
480$count = 0;
481
482while ( my $hit = $result->next_hit ) {
483    my $d = shift @valid;
484    is( $hit->name,      shift @$d );
485    is( $hit->length,    shift @$d );
486    is( $hit->accession, shift @$d );
487    float_is( $hit->significance, shift @$d );
488    is( $hit->bits,      shift @$d );
489    is( $hit->raw_score, shift @$d );
490    is( $hit->start,     shift @$d );
491    is( $hit->end,       shift @$d );
492    is( sprintf( "%.4f", $hit->frac_aligned_query ), shift @$d );
493
494    if ( $count == 0 ) {
495        my $hsps_left = 1;
496        while ( my $hsp = $hit->next_hsp ) {
497            is( $hsp->query->start,    1 );
498            is( $hsp->query->end,      60 );
499            is( $hsp->query->strand,   1 );
500            is( $hsp->hit->start,      154 );
501            is( $hsp->hit->end,        212 );
502            is( $hsp->hit->strand,     1 );
503            is( $hsp->length('total'), 60 );
504            float_is( $hsp->evalue, 3e-18 );
505            is( $hsp->score, 48 );
506            is( $hsp->bits,  95.6 );
507            is( sprintf( "%.2f", $hsp->percent_identity ),        96.67 );
508            is( sprintf( "%.4f", $hsp->frac_identical('query') ), 0.9667 );
509            is( sprintf( "%.4f", $hsp->frac_identical('hit') ),   0.9831 );
510            is( $hsp->query->frame(), 0 );
511            is( $hsp->hit->frame(),   0 );
512            is( $hsp->query->seq_id,  undef );
513            is( $hsp->hit->seq_id,    'gb|AY052359.1|' );
514            is( $hsp->gaps('query'),  0 );
515            is( $hsp->gaps('hit'),    1 );
516            is( $hsp->gaps,           1 );
517            is( $hsp->query_string,
518                'aggaatgctgtttaattggaatcgtacaatggagaatttgacggaaatagaatcaacgat'
519            );
520            is( $hsp->hit_string,
521                'aggaatgctgtttaattggaatca-acaatggagaatttgacggaaatagaatcaacgat'
522            );
523            is( $hsp->homology_string,
524                '|||||||||||||||||||||||  |||||||||||||||||||||||||||||||||||'
525            );
526            my $aln = $hsp->get_aln;
527            is( sprintf( "%.2f", $aln->overall_percentage_identity ), 96.67 );
528            is( sprintf( "%.2f", $aln->percentage_identity ),         98.31 );
529            is( $hsp->n, 1 );
530            $hsps_left--;
531        }
532        is( $hsps_left, 0 );
533    }
534    last if ( $count++ > @valid );
535}
536is( @valid, 0 );
537
538#WU-BlastX test
539
540$searchio = Bio::SearchIO->new(
541    '-format' => 'blast',
542    '-file'   => test_input_file('dnaEbsub_ecoli.wublastx')
543);
544
545$result = $searchio->next_result;
546is(
547    $result->algorithm_reference, 'Gish, W. (1996-2000) http://blast.wustl.edu
548Gish, Warren and David J. States (1993).  Identification of protein coding
549regions by database similarity search.  Nat. Genet. 3:266-72.
550'
551);
552is( $result->database_name,    'ecoli.aa' );
553is( $result->database_letters, 1358990 );
554is( $result->database_entries, 4289 );
555is( $result->algorithm,        'BLASTX' );
556like( $result->algorithm_version, qr/^2\.0MP\-WashU/ );
557is( $result->query_name, 'gi|142864|gb|M10040.1|BACDNAE' );
558is( $result->query_description,
559    'B.subtilis dnaE gene encoding DNA primase, complete cds' );
560is( $result->query_accession,         'M10040.1' );
561is( $result->query_gi,                142864 );
562is( $result->query_length,            2001 );
563is( $result->get_parameter('matrix'), 'blosum62' );
564
565is( $result->get_statistic('lambda'),  0.318 );
566is( $result->get_statistic('kappa'),   0.135 );
567is( $result->get_statistic('entropy'), 0.401 );
568
569is( $result->get_statistic('dbentries'), 4289 );
570
571@valid = ( [ 'gi|1789447|gb|AAC76102.1|', 581, 'AAC76102', '1.1e-74', 671 ] );
572$count = 0;
573
574while ( my $hit = $result->next_hit ) {
575    my $d = shift @valid;
576    is( $hit->name,      shift @$d );
577    is( $hit->length,    shift @$d );
578    is( $hit->accession, shift @$d );
579    float_is( $hit->significance, shift @$d );
580    is( $hit->raw_score, shift @$d );
581    is( sprintf( "%.4f", $hit->frac_identical('query') ), '0.3640' );
582    is( sprintf( "%.4f", $hit->frac_identical('hit') ),   '0.3660' );
583    is( sprintf( "%.4f", $hit->frac_conserved('query') ), '0.5370' );
584    is( sprintf( "%.4f", $hit->frac_conserved('hit') ),   '0.5400' );
585    is( sprintf( "%.4f", $hit->frac_aligned_query ),      '0.6200' );
586    is( sprintf( "%.4f", $hit->frac_aligned_hit ),        '0.7100' );
587
588    if ( $count == 0 ) {
589        my $hsps_left = 1;
590        while ( my $hsp = $hit->next_hsp ) {
591            is( $hsp->query->start,    21 );
592            is( $hsp->query->end,      1265 );
593            is( $hsp->query->strand,   1 );
594            is( $hsp->hit->start,      1 );
595            is( $hsp->hit->end,        413 );
596            is( $hsp->hit->strand,     0 );
597            is( $hsp->length('total'), 421 );
598            float_is( $hsp->evalue, 1.1e-74 );
599            float_is( $hsp->pvalue, '1.1e-74' );
600            is( $hsp->score, 671 );
601            is( $hsp->bits,  265.8 );
602            is( sprintf( "%.2f", $hsp->percent_identity ), 35.87 );
603
604            is( sprintf( "%.4f", $hsp->frac_identical('query') ), 0.3639 );
605            is( sprintf( "%.4f", $hsp->frac_identical('hit') ),   0.3656 );
606            is( sprintf( "%.4f", $hsp->frac_conserved('query') ), 0.5373 );
607            is( sprintf( "%.2f", $hsp->frac_conserved('hit') ),   0.54 );
608
609            is( sprintf( "%.4f", $hsp->frac_identical('hsp') ), 0.3587 );
610            is( sprintf( "%.4f", $hsp->frac_conserved('hsp') ), 0.5297 );
611
612            is( $hsp->query->frame(), 2 );
613            is( $hsp->hit->frame(),   0 );
614            is( $hsp->gaps('query'),  6 );
615            is( $hsp->gaps('hit'),    8 );
616            is( $hsp->gaps,           14 );
617            is( $hsp->query_string,
618'MGNRIPDEIVDQVQKSADIVEVIGDYVQLKKQGRNYFGLCPFHGESTPSFSVSPDKQIFHCFGCGAGGNVFSFLRQMEGYSFAESVSHLADKYQIDFPDDITVHSGARP---ESSGEQKMAEAHELLKKFYHHLLINTKEGQEALDYLLSRGFTKELINEFQIGYALDSWDFITKFLVKRGFSEAQMEKAGLLIRREDGSGYFDRFRNRVMFPIHDHHGAVVAFSGRALGSQQPKYMNSPETPLFHKSKLLYNFYKARLHIRKQERAVLFEGFADVYTAVSSDVKESIATMGTSLTDDHVKILRRNVEEIILCYDSDKAGYEATLKASELL---QKKGCKVRVAMIPDGLDPDDYIKKFGGEKFKNDIIDASVTVMAFKMQYFRKGKNLSDEGDRLAYIKDVLKEISTLSGSLEQEVYVKQ'
619            );
620            is( $hsp->hit_string,
621'MAGRIPRVFINDLLARTDIVDLIDARVKLKKQGKNFHACCPFHNEKTPSFTVNGEKQFYHCFGCGAHGNAIDFLMNYDKLEFVETVEELAAMHNLEVPFE----AGSGPSQIERHQRQTLYQLMDGLNTFYQQSL-QQPVATSARQYLEKRGLSHEVIARFAIGFAPPGWDNVLKRFGGNPENRQSLIDAGMLVTNDQGRSY-DRFRERVMFPIRDKRGRVIGFGGRVLGNDTPKYLNSPETDIFHKGRQLYGLYEAQQDNAEPNRLLVVEGYMDVVALAQYGINYAVASLGTSTTADHIQLLFRATNNVICCYDGDRAGRDAAWRALETALPYMTDGRQLRFMFLPDGEDPDTLVRKEGKEAFEARM-EQAMPLSAFLFNSLMPQVDLSTPDGRARLSTLALPLISQVPGETLR-IYLRQ'
622            );
623            is( $hsp->homology_string,
624'M  RIP   ++ +    DIV++I   V+LKKQG+N+   CPFH E TPSF+V+ +KQ +HCFGCGA GN   FL   +   F E+V  LA  + ++ P +    +G+ P   E    Q + +  + L  FY   L        A  YL  RG + E+I  F IG+A   WD + K       +   +  AG+L+  + G  Y DRFR RVMFPI D  G V+ F GR LG+  PKY+NSPET +FHK + LY  Y+A+    +  R ++ EG+ DV       +  ++A++GTS T DH+++L R    +I CYD D+AG +A  +A E        G ++R   +PDG DPD  ++K G E F+  + + ++ + AF         +LS    R       L  IS + G   + +Y++Q'
625            );
626            is(
627                join( ' ', $hsp->seq_inds( 'query', 'nomatch', 1 ) ),
628'24-29 39-47 54-56 60-71 90-98 129-137 150-152 156-158 180-182 192-194 219-221 228-236 243-251 255-263 267-269 279-284 291-296 300-302 309-311 315-317 321-332 342-344 351-362 366-368 372-374 378-383 387-389 393-398 405-413 417-440 444-449 456-461 468-470 474-476 486-491 495-497 510-518 525-527 531-533 537-557 561-569 573-578 594-599 603-605 609-614 618-620 633-635 654-656 660-665 669-671 678-680 684-686 693-695 705-710 738-740 753-755 759-761 768-773 786-797 801-806 810-812 819-821 831-833 840-860 864-869 894-896 900-902 921-923 927-938 945-947 957-959 972-974 981-986 993-995 999-1013 1017-1019 1029-1037 1050-1052 1062-1067 1077-1079 1083-1085 1089-1091 1098-1103 1107-1109 1113-1115 1122-1124 1128-1130 1137-1163 1173-1184 1188-1208 1212-1217 1224-1226 1230-1232 1236-1244 1248-1250'
629            );
630            is(
631                join( ' ', $hsp->seq_inds( 'query', 'mismatch', 1 ) ),
632'24-29 39-47 54-56 60-71 90-98 129-137 150-152 156-158 180-182 192-194 219-221 228-236 243-251 255-263 267-269 279-284 291-296 300-302 309-311 315-317 342-344 351-362 366-368 372-374 378-383 387-389 393-398 405-413 420-440 444-449 456-461 468-470 474-476 486-491 495-497 510-518 525-527 531-533 537-557 561-569 573-578 594-599 603-605 609-614 633-635 654-656 660-665 669-671 678-680 684-686 693-695 705-710 738-740 753-755 759-761 768-773 786-797 801-806 810-812 819-821 831-833 840-860 864-869 894-896 900-902 921-923 927-938 945-947 957-959 972-974 981-986 993-995 999-1013 1017-1019 1029-1037 1050-1052 1062-1067 1077-1079 1083-1085 1089-1091 1098-1103 1113-1115 1122-1124 1128-1130 1137-1163 1173-1184 1188-1208 1212-1217 1224-1226 1230-1232 1236-1244'
633            );
634            is(
635                join( ' ', $hsp->seq_inds( 'hit', 'nomatch', 1 ) ),
636'2 3 7-9 12 14-17 24-26 37-39 44 46 54 58 67 70-72 75-77 79-81 83 87 88 91 92 94 97 99 104 106-108 110-113 115 117 119 120 122 124 125 128-130 132-138 140 141 144 145 148 150 154 155 157 162-164 167 169 171-177 179-181 183 184 190 191 193 195 196 202 209 211 212 214 217 219 222 226 227 237 242 244 247 248 253-256 258 259 261 264 268 271-277 279 280 289 291 298 300-303 306 310 315 318 319 322 324-331 333 337-339 344 348 349 353 355 357 360 361 364 367 369 372-380 384-387 389-395 397 398 401 403 405-407'
637            );
638            is(
639                join( ' ', $hsp->seq_inds( 'hit', 'mismatch', 1 ) ),
640'2 3 7-9 12 14-17 24-26 37-39 44 46 54 58 67 70-72 75-77 79-81 83 87 88 91 92 94 97 99 104 110-113 115 117 119 120 122 124 125 128-130 132-138 140 141 144 145 148 150 154 155 157 162-164 167 169 171-177 179-181 183 184 190 191 193 195 196 202 209 211 212 214 217 219 222 226 227 237 242 244 247 248 253-256 258 259 261 264 268 271-277 279 280 289 291 298 300-303 306 310 315 318 319 322 324 325 329-331 333 337-339 344 348 349 353 355 357 360 361 364 367 369 372-380 384-387 389-395 397 398 401 403 405-407'
641            );
642            is( join( ' ', $hsp->seq_inds( 'query', 'gaps', 1 ) ), '347 1004' );
643            is( join( ' ', $hsp->seq_inds( 'hit', 'gaps', 1 ) ),
644                '100 131 197 362 408' );
645            is( $hsp->ambiguous_seq_inds, 'query' );
646            is( $hsp->n,                  1 );
647            $hsps_left--;
648        }
649        is( $hsps_left, 0 );
650    }
651    last if ( $count++ > @valid );
652}
653is( @valid, 0 );
654
655#Trickier WU-Blast
656$searchio = Bio::SearchIO->new(
657    '-format' => 'blast',
658    '-file'   => test_input_file('tricky.wublast')
659);
660$result = $searchio->next_result;
661my $hits_left = 1;
662while ( my $hit = $result->next_hit ) {
663
664# frac_aligned_hit used to be over 1, frac_identical & frac_conserved are still too wrong
665  TODO: {
666        local $TODO = 'frac_identical & frac_conserved are still too wrong';
667        cmp_ok sprintf( "%.3f", $hit->frac_identical ), '>',  0.9;
668        cmp_ok sprintf( "%.3f", $hit->frac_conserved ), '<=', 1;
669    }
670    is( sprintf( "%.2f", $hit->frac_aligned_query ), '0.92' );
671    is( sprintf( "%.2f", $hit->frac_aligned_hit ),   '0.91' );
672    $hits_left--;
673}
674is( $hits_left, 0 );
675
676# More frac_ method testing, this time on ncbi blastn
677$searchio = Bio::SearchIO->new(
678    '-format' => 'blast',
679    '-file'   => test_input_file('frac_problems.blast')
680);
681my @expected = ( "1.000", "0.943" );
682while ( my $result = $searchio->next_result ) {
683    my $hit = $result->next_hit;
684    is( $hit->frac_identical, shift @expected );
685}
686is( @expected, 0 );
687
688# And even more: frac_aligned_query should never be over 1!
689$searchio = Bio::SearchIO->new(
690    '-format' => 'blast',
691    '-file'   => test_input_file('frac_problems2.blast')
692);
693$result = $searchio->next_result;
694$hit    = $result->next_hit;
695is $hit->frac_aligned_query, 0.97;
696
697# Also, start and end should be sane
698$searchio = Bio::SearchIO->new(
699    '-format' => 'blast',
700    '-file'   => test_input_file('frac_problems3.blast')
701);
702$result = $searchio->next_result;
703$hit    = $result->next_hit;
704is $hit->start('sbjct'), 207;
705is $hit->end('sbjct'),   1051;
706
707#WU-TBlastN test
708
709$searchio = Bio::SearchIO->new(
710    '-format' => 'blast',
711    '-file'   => test_input_file('dnaEbsub_ecoli.wutblastn')
712);
713
714$result = $searchio->next_result;
715is(
716    $result->algorithm_reference, 'Gish, W. (1996-2000) http://blast.wustl.edu
717'
718);
719is( $result->database_name,    'ecoli.nt' );
720is( $result->database_letters, 4662239 );
721is( $result->database_entries, 400 );
722is( $result->algorithm,        'TBLASTN' );
723like( $result->algorithm_version, qr/^2\.0MP\-WashU/ );
724is( $result->query_name,              'gi|142865|gb|AAA22406.1|' );
725is( $result->query_description,       'DNA primase' );
726is( $result->query_accession,         'AAA22406.1' );
727is( $result->query_gi,                142865 );
728is( $result->query_length,            603 );
729is( $result->get_parameter('matrix'), 'blosum62' );
730
731is( $result->get_statistic('lambda'),  '0.320' );
732is( $result->get_statistic('kappa'),   0.136 );
733is( $result->get_statistic('entropy'), 0.387 );
734
735is( $result->get_statistic('dbentries'), 400 );
736
737@valid =
738  ( [ 'gi|1789441|gb|AE000388.1|AE000388', 10334, 'AE000388', '1.4e-73', 671 ]
739  );
740$count = 0;
741
742while ( my $hit = $result->next_hit ) {
743    my $d = shift @valid;
744    is( $hit->name,      shift @$d );
745    is( $hit->length,    shift @$d );
746    is( $hit->accession, shift @$d );
747    float_is( $hit->significance, shift @$d );
748    is( $hit->raw_score, shift @$d );
749
750    if ( $count == 0 ) {
751        my $hsps_left = 1;
752        while ( my $hsp = $hit->next_hsp ) {
753            is( $hsp->query->start,    1 );
754            is( $hsp->query->end,      415 );
755            is( $hsp->query->strand,   0 );
756            is( $hsp->hit->start,      4778 );
757            is( $hsp->hit->end,        6016 );
758            is( $hsp->hit->strand,     1 );
759            is( $hsp->length('total'), 421 );
760            float_is( $hsp->evalue, 1.4e-73 );
761            float_is( $hsp->pvalue, 1.4e-73 );
762            is( $hsp->score, 671 );
763            is( $hsp->bits,  265.8 );
764            is( sprintf( "%.2f", $hsp->percent_identity ),        35.87 );
765            is( sprintf( "%.4f", $hsp->frac_identical('hit') ),   0.3656 );
766            is( sprintf( "%.4f", $hsp->frac_identical('query') ), 0.3639 );
767            is( sprintf( "%.4f", $hsp->frac_conserved('hsp') ),   0.5297 );
768            is( $hsp->query->frame(), 0 );
769            is( $hsp->hit->frame(),   1 );
770            is( $hsp->gaps('query'),  6 );
771            is( $hsp->gaps('hit'),    8 );
772            is( $hsp->gaps,           14 );
773            is( $hsp->query_string,
774'MGNRIPDEIVDQVQKSADIVEVIGDYVQLKKQGRNYFGLCPFHGESTPSFSVSPDKQIFHCFGCGAGGNVFSFLRQMEGYSFAESVSHLADKYQIDFPDDITVHSGARP---ESSGEQKMAEAHELLKKFYHHLLINTKEGQEALDYLLSRGFTKELINEFQIGYALDSWDFITKFLVKRGFSEAQMEKAGLLIRREDGSGYFDRFRNRVMFPIHDHHGAVVAFSGRALGSQQPKYMNSPETPLFHKSKLLYNFYKARLHIRKQERAVLFEGFADVYTAVSSDVKESIATMGTSLTDDHVKILRRNVEEIILCYDSDKAGYEATLKASELL---QKKGCKVRVAMIPDGLDPDDYIKKFGGEKFKNDIIDASVTVMAFKMQYFRKGKNLSDEGDRLAYIKDVLKEISTLSGSLEQEVYVKQ'
775            );
776            is( $hsp->hit_string,
777'MAGRIPRVFINDLLARTDIVDLIDARVKLKKQGKNFHACCPFHNEKTPSFTVNGEKQFYHCFGCGAHGNAIDFLMNYDKLEFVETVEELAAMHNLEVPFE----AGSGPSQIERHQRQTLYQLMDGLNTFYQQSL-QQPVATSARQYLEKRGLSHEVIARFAIGFAPPGWDNVLKRFGGNPENRQSLIDAGMLVTNDQGRSY-DRFRERVMFPIRDKRGRVIGFGGRVLGNDTPKYLNSPETDIFHKGRQLYGLYEAQQDNAEPNRLLVVEGYMDVVALAQYGINYAVASLGTSTTADHIQLLFRATNNVICCYDGDRAGRDAAWRALETALPYMTDGRQLRFMFLPDGEDPDTLVRKEGKEAFEARM-EQAMPLSAFLFNSLMPQVDLSTPDGRARLSTLALPLISQVPGETLR-IYLRQ'
778            );
779            is( $hsp->homology_string,
780'M  RIP   ++ +    DIV++I   V+LKKQG+N+   CPFH E TPSF+V+ +KQ +HCFGCGA GN   FL   +   F E+V  LA  + ++ P +    +G+ P   E    Q + +  + L  FY   L        A  YL  RG + E+I  F IG+A   WD + K       +   +  AG+L+  + G  Y DRFR RVMFPI D  G V+ F GR LG+  PKY+NSPET +FHK + LY  Y+A+    +  R ++ EG+ DV       +  ++A++GTS T DH+++L R    +I CYD D+AG +A  +A E        G ++R   +PDG DPD  ++K G E F+  + + ++ + AF         +LS    R       L  IS + G   + +Y++Q'
781            );
782            is(
783                join( ' ', $hsp->seq_inds( 'query', 'nomatch', 1 ) ),
784'2 3 7-9 12 14-17 24-26 37-39 44 46 54 58 67 70-72 75-77 79-81 83 87 88 91 92 94 97 99 101-104 108 111-114 116 118 120 121 123 125 126 129-131 133-140 142 143 146 147 150 152 156 157 159 164-166 169 171 173-179 181-183 185 186 192 193 195 197 198 200 205 212 214 215 217 220 222 225 229 230 240 245 247 250 251 256-259 261 262 264 267 271 274-280 282 283 292 294 301 303-306 309 313 318 321 322 325 327-331 333 337-339 344 348 349 353 355 357 360 361 363 365 368 370 373-381 385-388 390-396 398 399 402 404 406-408 410'
785            );
786            is(
787                join( ' ', $hsp->seq_inds( 'hit', 'nomatch', 1 ) ),
788'4781-4786 4796-4804 4811-4813 4817-4828 4847-4855 4886-4894 4907-4909 4913-4915 4937-4939 4949-4951 4976-4978 4985-4993 5000-5008 5012-5020 5024-5026 5036-5041 5048-5053 5057-5059 5066-5068 5072-5074 5087-5089 5093-5101 5105-5116 5120-5122 5126-5128 5132-5137 5141-5143 5147-5152 5159-5167 5171-5191 5195-5200 5207-5212 5219-5221 5225-5227 5237-5242 5246-5248 5261-5269 5276-5278 5282-5284 5288-5308 5312-5320 5324-5329 5345-5350 5354-5356 5360-5365 5381-5383 5402-5404 5408-5413 5417-5419 5426-5428 5432-5434 5441-5443 5453-5458 5486-5488 5501-5503 5507-5509 5516-5521 5534-5545 5549-5554 5558-5560 5567-5569 5579-5581 5588-5608 5612-5617 5642-5644 5648-5650 5669-5671 5675-5686 5693-5695 5705-5707 5720-5722 5729-5734 5741-5743 5747-5770 5774-5776 5786-5794 5807-5809 5819-5824 5834-5836 5840-5842 5846-5848 5855-5860 5867-5869 5876-5878 5882-5884 5891-5917 5927-5938 5942-5962 5966-5971 5978-5980 5984-5986 5990-5998'
789            );
790            is( join( ' ', $hsp->seq_inds( 'query', 'gaps', 1 ) ), '109 328' );
791            is( join( ' ', $hsp->seq_inds( 'hit', 'gaps', 1 ) ),
792                '5077 5170 5368 5863 6001' );
793            is( $hsp->ambiguous_seq_inds, 'subject' );
794            is( $hsp->n,                  1 );
795            $hsps_left--;
796        }
797        is( $hsps_left, 0 );
798    }
799    last if ( $count++ > @valid );
800}
801is( $count, 1 );
802
803# WU-BLAST TBLASTX
804$searchio = Bio::SearchIO->new(
805    '-format' => 'blast',
806    '-file'   => test_input_file('dnaEbsub_ecoli.wutblastx')
807);
808
809$result = $searchio->next_result;
810is(
811    $result->algorithm_reference, 'Gish, W. (1996-2000) http://blast.wustl.edu
812'
813);
814is( $result->database_name,    'ecoli.nt' );
815is( $result->database_letters, 4662239 );
816is( $result->database_entries, 400 );
817is( $result->algorithm,        'TBLASTX' );
818like( $result->algorithm_version, qr/^2\.0MP\-WashU/ );
819is( $result->query_name, 'gi|142864|gb|M10040.1|BACDNAE' );
820is( $result->query_description,
821    'B.subtilis dnaE gene encoding DNA primase, complete cds' );
822is( $result->query_accession,         'M10040.1' );
823is( $result->query_gi,                142864 );
824is( $result->query_length,            2001 );
825is( $result->get_parameter('matrix'), 'blosum62' );
826
827is( $result->get_statistic('lambda'),    0.318 );
828is( $result->get_statistic('kappa'),     0.135 );
829is( $result->get_statistic('entropy'),   0.401 );
830is( $result->get_statistic('dbentries'), 400 );
831
832@valid = (
833    [
834        'gi|1789441|gb|AE000388.1|AE000388',
835        10334, 'AE000388', '6.4e-70', 318, 148.6
836    ],
837    [ 'gi|2367383|gb|AE000509.1|AE000509', 10589, 'AE000509', 1, 59, 29.9 ]
838);
839$count = 0;
840
841while ( my $hit = $result->next_hit ) {
842    my $d = shift @valid;
843    is( $hit->name,      shift @$d );
844    is( $hit->length,    shift @$d );
845    is( $hit->accession, shift @$d );
846
847    # using e here to deal with 0.9992 coming out right here as well
848    float_is( $hit->significance, shift @$d );
849    is( $hit->raw_score, shift @$d );
850    is( $hit->bits,      shift @$d );
851    if ( $count == 0 ) {
852        my $hspcounter = 0;
853        while ( my $hsp = $hit->next_hsp ) {
854            $hspcounter++;
855            if ( $hspcounter == 3 ) {
856
857                # let's actually look at the 3rd HSP
858                is( $hsp->query->start,    441 );
859                is( $hsp->query->end,      617 );
860                is( $hsp->query->strand,   1 );
861                is( $hsp->hit->start,      5192 );
862                is( $hsp->hit->end,        5368 );
863                is( $hsp->hit->strand,     1 );
864                is( $hsp->length('total'), 59 );
865                float_is( $hsp->evalue, 6.4e-70 );
866                float_is( $hsp->pvalue, 6.4e-70 );
867                is( $hsp->score, 85 );
868                is( $hsp->bits,  41.8 );
869                is( sprintf( "%.2f", $hsp->percent_identity ),        '32.20' );
870                is( sprintf( "%.3f", $hsp->frac_identical('hit') ),   0.322 );
871                is( sprintf( "%.3f", $hsp->frac_identical('query') ), 0.322 );
872                is( sprintf( "%.4f", $hsp->frac_conserved('hsp') ),   0.4746 );
873                is( $hsp->query->frame(), 2 );
874                is( $hsp->hit->frame(),   1 );
875                is( $hsp->gaps('query'),  0 );
876                is( $hsp->gaps('hit'),    0 );
877                is( $hsp->gaps,           0 );
878                is( $hsp->n,              1 );
879                is( $hsp->query_string,
880'ALDYLLSRGFTKELINEFQIGYALDSWDFITKFLVKRGFSEAQMEKAGLLIRREDGSGY'
881                );
882                is( $hsp->hit_string,
883'ARQYLEKRGLSHEVIARFAIGFAPPGWDNVLKRFGGNPENRQSLIDAGMLVTNDQGRSY'
884                );
885                is( $hsp->homology_string,
886'A  YL  RG + E+I  F IG+A   WD + K       +   +  AG+L+  + G  Y'
887                );
888                is(
889                    join( ' ', $hsp->seq_inds( 'query', 'nomatch', 1 ) ),
890'444-449 456-461 468-470 474-476 486-491 495-497 510-518 525-527 531-533 537-557 561-569 573-578 594-599 603-605 609-614'
891                );
892                is(
893                    join( ' ', $hsp->seq_inds( 'hit', 'nomatch', 1 ) ),
894'5195-5200 5207-5212 5219-5221 5225-5227 5237-5242 5246-5248 5261-5269 5276-5278 5282-5284 5288-5308 5312-5320 5324-5329 5345-5350 5354-5356 5360-5365'
895                );
896                is( $hsp->ambiguous_seq_inds, 'query/subject' );
897                last;
898            }
899        }
900        is( $hspcounter, 3 );
901    }
902    elsif ( $count == 1 ) {
903        my $hsps_to_do = 1;
904        while ( my $hsp = $hit->next_hsp ) {
905            is( $hsp->query->start,    587 );
906            is( $hsp->query->end,      706 );
907            is( $hsp->query->strand,   -1 );
908            is( $hsp->hit->start,      4108 );
909            is( $hsp->hit->end,        4227 );
910            is( $hsp->hit->strand,     -1 );
911            is( $hsp->length('total'), 40 );
912            float_is( $hsp->evalue, 7.1 );
913            float_is( $hsp->pvalue, '1.00' );
914            is( $hsp->score, 59 );
915            is( $hsp->bits,  29.9 );
916            is( sprintf( "%.2f", $hsp->percent_identity ),        '37.50' );
917            is( sprintf( "%.4f", $hsp->frac_identical('hit') ),   '0.3750' );
918            is( sprintf( "%.4f", $hsp->frac_identical('query') ), '0.3750' );
919            is( sprintf( "%.4f", $hsp->frac_conserved('hsp') ),   '0.4750' );
920            is( $hsp->query->frame(), 2 );
921            is( $hsp->hit->frame(),   2 );
922            is( $hsp->gaps('query'),  0 );
923            is( $hsp->gaps('hit'),    0 );
924            is( $hsp->gaps,           0 );
925            is( $hsp->n,              1 );
926            is( $hsp->query_string,
927                'WLPRALPEKATTAP**SWIGNMTRFLKRSKYPLPSSRLIR' );
928            is( $hsp->hit_string, 'WLSRTTVGSSTVSPRTFWITRMKVKLSSSKVTLPSTKSTR' );
929            is( $hsp->homology_string,
930                'WL R     +T +P   WI  M   L  SK  LPS++  R' );
931            $hsps_to_do--;
932            last;
933        }
934        is( $hsps_to_do, 0 );
935    }
936    last if ( $count++ > @valid );
937}
938is( $count, 2 );
939
940# WU-BLAST -echofilter option test (Bug 2388)
941$searchio = Bio::SearchIO->new(
942    '-format' => 'blast',
943    '-file'   => test_input_file('echofilter.wublastn')
944);
945
946$result = $searchio->next_result;
947is(
948    $result->algorithm_reference, 'Gish, W. (1996-2006) http://blast.wustl.edu
949'
950);
951is( $result->database_name,    'NM_003201.fa' );
952is( $result->database_letters, 1936 );
953is( $result->database_entries, 1 );
954is( $result->algorithm,        'BLASTN' );
955like( $result->algorithm_version, qr/^2\.0MP\-WashU/ );
956like( $result->query_name,
957qr/ref|NM_003201.1| Homo sapiens transcription factor A, mitochondrial \(TFAM\), mRNA/
958);
959is( $result->query_accession, 'NM_003201.1' );
960
961is( $result->query_length,               1936 );
962is( $result->get_statistic('lambda'),    0.192 );
963is( $result->get_statistic('kappa'),     0.182 );
964is( $result->get_statistic('entropy'),   0.357 );
965is( $result->get_statistic('dbletters'), 1936 );
966is( $result->get_statistic('dbentries'), 1 );
967is( $result->get_parameter('matrix'),    '+5,-4' );
968
969@valid = ( [ 'ref|NM_003201.1|', 1936, 'NM_003201', '0', 9680 ], );
970$count = 0;
971while ( $hit = $result->next_hit ) {
972    my $d = shift @valid;
973
974    is( $hit->name,      shift @$d );
975    is( $hit->length,    shift @$d );
976    is( $hit->accession, shift @$d );
977    float_is( $hit->significance, shift @$d );
978    is( $hit->raw_score, shift @$d );
979
980    if ( $count == 0 ) {
981        my $hsps_left = 1;
982        while ( my $hsp = $hit->next_hsp ) {
983            is( $hsp->query->start,    1 );
984            is( $hsp->query->end,      1936 );
985            is( $hsp->hit->start,      1 );
986            is( $hsp->hit->end,        1936 );
987            is( $hsp->length('total'), 1936 );
988
989            float_is( $hsp->evalue, 0. );
990            float_is( $hsp->pvalue, '0.' );
991            is( $hsp->score,                   9680 );
992            is( $hsp->bits,                    1458.4 );
993            is( $hsp->percent_identity,        100 );
994            is( $hsp->frac_identical('query'), 1.00 );
995            is( $hsp->frac_identical('hit'),   1.00 );
996            is( $hsp->gaps,                    0 );
997            is( $hsp->n,                       1 );
998            $hsps_left--;
999        }
1000        is( $hsps_left, 0 );
1001    }
1002    last if ( $count++ > @valid );
1003}
1004is( @valid, 0 );
1005
1006# Do a multiblast report test
1007$searchio = Bio::SearchIO->new(
1008    '-format' => 'blast',
1009    '-file'   => test_input_file('multi_blast.bls')
1010);
1011
1012@expected = qw(CATH_RAT CATL_HUMAN CATL_RAT PAPA_CARPA);
1013my $results_left = 4;
1014while ( my $result = $searchio->next_result ) {
1015    like($result->algorithm_reference, qr/Gapped BLAST and PSI-BLAST/);
1016    is( $result->query_name, shift @expected, "Multiblast query test" );
1017    $results_left--;
1018}
1019is( $results_left, 0 );
1020
1021# Test GCGBlast parsing
1022
1023$searchio = Bio::SearchIO->new(
1024    '-format' => 'blast',
1025    '-file'   => test_input_file('test.gcgblast')
1026);
1027$result = $searchio->next_result();
1028like($result->algorithm_reference,qr/Gapped BLAST and PSI-BLAST/);
1029is( $result->query_name,                   '/v0/people/staji002/test.gcg' );
1030is( $result->algorithm,                    'BLASTP' );
1031is( $result->algorithm_version,            '2.2.1 [Apr-13-2001]' );
1032is( $result->database_name,                'pir' );
1033is( $result->database_entries,             274514 );
1034is( $result->database_letters,             93460074 );
1035is( $result->get_statistic('querylength'), 44 );
1036is( $result->get_statistic('effectivedblength'),  65459646 );
1037is( $result->get_statistic('effectivespace'),     2880224424 );
1038is( $result->get_statistic('effectivespaceused'), 2880224424 );
1039
1040$hit = $result->next_hit;
1041is( $hit->description, 'F22B7.10 protein - Caenorhabditis elegans' );
1042is( $hit->name,        'PIR2:S44629' );
1043is( $hit->length,      628 );
1044is( $hit->accession,   'PIR2:S44629' );
1045float_is( $hit->significance, 2e-08 );
1046is( $hit->raw_score, 136 );
1047is( $hit->bits,      '57.0' );
1048$hsp = $hit->next_hsp;
1049float_is( $hsp->evalue, 2e-08 );
1050is( $hsp->bits,                    '57.0' );
1051is( $hsp->score,                   136 );
1052is( int( $hsp->percent_identity ), 28 );
1053is( sprintf( "%.2f", $hsp->frac_identical('query') ), 0.29 );
1054is( $hsp->frac_conserved('total'), 69 / 135 );
1055is( $hsp->gaps('total'),           8 );
1056is( $hsp->gaps('hit'),             6 );
1057is( $hsp->gaps('query'),           2 );
1058
1059is( $hsp->hit->start,   342 );
1060is( $hsp->hit->end,     470 );
1061is( $hsp->query->start, 3 );
1062is( $hsp->query->end,   135 );
1063
1064is( $hsp->query_string,
1065'CAAEFDFMEKETPLRYTKTXXXXXXXXXXXXXXRKIISDMWGVLAKQQTHVRKHQFDHGELVYHALQLLAYTALGILIMRLKLFLTPYMCVMASLICSRQLFGW--LFCKVHPGAIVFVILAAMSIQGSANLQTQ'
1066);
1067is( $hsp->hit_string,
1068'CSAEFDFIQYSTIEKLCGTLLIPLALISLVTFVFNFVKNT-NLLWRNSEEIG----ENGEILYNVVQLCCSTVMAFLIMRLKLFMTPHLCIVAALFANSKLLGGDRISKTIRVSALVGVI-AILFYRGIPNIRQQ'
1069);
1070is( $hsp->homology_string,
1071'C+AEFDF++  T  +   T                 + +   +L +    +     ++GE++Y+ +QL   T +  LIMRLKLF+TP++C++A+L  + +L G   +   +   A+V VI A +  +G  N++ Q'
1072);
1073
1074#test all the database accession number formats
1075$searchio = Bio::SearchIO->new(
1076    -format => 'blast',
1077    -file   => test_input_file('testdbaccnums.out')
1078);
1079$result = $searchio->next_result;
1080like($result->algorithm_reference,qr/Gapped BLAST and PSI-BLAST/);
1081is( $result->rid,                                 '1036160600-011802-21377' );
1082is( $result->get_statistic('querylength'),        9 );
1083is( $result->get_statistic('effectivedblength'),  35444647 );
1084is( $result->get_statistic('effectivespace'),     319001823 );
1085is( $result->get_statistic('effectivespaceused'), 319001823 );
1086
1087@valid = (
1088    [ 'pir||T14789',           'T14789',    'T14789', 'CAB53709', 'AAH01726' ],
1089    [ 'gb|NP_065733.1|CYT19',  'NP_065733', 'CYT19' ],
1090    [ 'emb|XP_053690.4|Cyt19', 'XP_053690' ],
1091    [ 'dbj|NP_056277.2|DKFZP586L0724',      'NP_056277' ],
1092    [ 'prf||XP_064862.2',                   'XP_064862' ],
1093    [ 'pdb|BAB13968.1|1',                   'BAB13968' ],
1094    [ 'sp|Q16478|GLK5_HUMAN',               'Q16478' ],
1095    [ 'pat|US|NP_002079.2',                 'NP_002079' ],
1096    [ 'bbs|NP_079463.2|',                   'NP_079463' ],
1097    [ 'gnl|db1|NP_002444.1',                'NP_002444' ],
1098    [ 'ref|XP_051877.1|',                   'XP_051877' ],
1099    [ 'lcl|AAH16829.1|',                    'AAH16829' ],
1100    [ 'gi|1|gb|NP_065733.1|CYT19',          'NP_065733' ],
1101    [ 'gi|2|emb|XP_053690.4|Cyt19',         'XP_053690' ],
1102    [ 'gi|3|dbj|NP_056277.2|DKFZP586L0724', 'NP_056277' ],
1103    [ 'gi|4|pir||T14789',                   'T14789' ],
1104    [ 'gi|5|prf||XP_064862.2',              'XP_064862' ],
1105    [ 'gi|6|pdb|BAB13968.1|1',              'BAB13968' ],
1106    [ 'gi|7|sp|Q16478|GLK5_HUMAN',          'Q16478' ],
1107    [ 'gi|8|pat|US|NP_002079.2',            'NP_002079' ],
1108    [ 'gi|9|bbs|NP_079463.2|',              'NP_079463' ],
1109    [ 'gi|10|gnl|db1|NP_002444.1',          'NP_002444' ],
1110    [ 'gi|11|ref|XP_051877.1|',             'XP_051877' ],
1111    [ 'gi|12|lcl|AAH16829.1|',              'AAH16829' ],
1112    [ 'MY_test_ID',                         'MY_test_ID' ]
1113);
1114
1115$hit = $result->next_hit;
1116my $d = shift @valid;
1117is( $hit->name,      shift @$d );
1118is( $hit->accession, shift @$d );
1119my @accnums = $hit->each_accession_number;
1120foreach my $a (@accnums) {
1121    is( $a, shift @$d );
1122}
1123$d   = shift @valid;
1124$hit = $result->next_hit;
1125is( $hit->name,      shift @$d );
1126is( $hit->accession, shift @$d );
1127is( $hit->locus,     shift @$d );
1128
1129$hits_left = 23;
1130while ( $hit = $result->next_hit ) {
1131    my $d = shift @valid;
1132    is( $hit->name,      shift @$d );
1133    is( $hit->accession, shift @$d );
1134    $hits_left--;
1135}
1136is( $hits_left, 0 );
1137
1138# Parse MEGABLAST
1139
1140# parse the BLAST-like output
1141my $infile = test_input_file('503384.MEGABLAST.2');
1142my $in     = Bio::SearchIO->new(
1143    -file   => $infile,
1144    -format => 'blast'
1145);    # this is megablast blast-like output
1146my $r        = $in->next_result;
1147my @dcompare = (
1148    [
1149        'Contig3700', 5631, 396, 785, '0.0', 785, '0.0', 396, 639, 12, 8723,
1150        9434, 1, 4083, 4794, -1
1151    ],
1152    [
1153        'Contig3997', 12734, 335, 664, '0.0', 664, '0.0', 335, 401, 0, 1282,
1154        1704, 1, 1546, 1968, -1
1155    ],
1156    [
1157        'Contig634', 858, 245, 486, '1e-136', 486,
1158        '1e-136',    245, 304, 3,   7620,     7941,
1159        1,           1,   321, -1
1160    ],
1161    [
1162        'Contig1853', 2314, 171,  339, '1e-91', 339,
1163        '1e-91',      171,  204,  0,   6406,    6620,
1164        1,            1691, 1905, 1
1165    ]
1166);
1167
1168like($r->algorithm_reference,qr/A greedy algorithm for aligning DNA sequences/);
1169is( $r->algorithm,                           'MEGABLAST' );
1170is( $r->query_name,                          '503384' );
1171is( $r->query_description,                   '11337 bp 2 contigs' );
1172is( $r->query_length,                        11337 );
1173is( $r->database_name,                       'cneoA.nt' );
1174is( $r->database_letters,                    17206226 );
1175is( $r->database_entries,                    4935 );
1176is( $r->get_statistic('querylength'),        11318 );
1177is( $r->get_statistic('effectivedblength'),  17112461 );
1178is( $r->get_statistic('effectivespace'),     193678833598 );
1179is( $r->get_statistic('effectivespaceused'), 0 );
1180
1181$hits_left = 4;
1182while ( my $hit = $r->next_hit ) {
1183    my $d = shift @dcompare;
1184    is( $hit->name,      shift @$d );
1185    is( $hit->length,    shift @$d );
1186    is( $hit->raw_score, shift @$d );
1187    is( $hit->bits,      shift @$d );
1188    float_is( $hit->significance, shift @$d );
1189
1190    my $hsp = $hit->next_hsp;
1191    is( $hsp->bits, shift @$d );
1192    float_is( $hsp->evalue, shift @$d );
1193    is( $hsp->score,         shift @$d );
1194    is( $hsp->num_identical, shift @$d );
1195    is( $hsp->gaps('total'), shift @$d );
1196    is( $hsp->query->start,  shift @$d );
1197    is( $hsp->query->end,    shift @$d );
1198    is( $hsp->query->strand, shift @$d );
1199    is( $hsp->hit->start,    shift @$d );
1200    is( $hsp->hit->end,      shift @$d );
1201    is( $hsp->hit->strand,   shift @$d );
1202    is( $hsp->n,             1 );
1203    $hits_left--;
1204}
1205is( $hits_left, 0 );
1206
1207# Let's test RPS-BLAST
1208
1209my $parser = Bio::SearchIO->new(
1210    -format => 'blast',
1211    -file   => test_input_file('ecoli_domains.rpsblast')
1212);
1213
1214$r = $parser->next_result;
1215is( $r->algorithm,                           'RPS-BLAST(BLASTP)');
1216is( $r->algorithm_version,                   '2.2.4 [Aug-26-2002]');
1217is( $r->algorithm_reference,                 undef );
1218is( $r->query_name,                          'gi|1786183|gb|AAC73113.1|' );
1219is( $r->query_gi,                            1786183 );
1220is( $r->num_hits,                            7 );
1221is( $r->get_statistic('querylength'),        438 );
1222is( $r->get_statistic('effectivedblength'),  31988 );
1223is( $r->get_statistic('effectivespace'),     14010744 );
1224is( $r->get_statistic('effectivespaceused'), 24054976 );
1225$hit = $r->next_hit;
1226is( $hit->name, 'gnl|CDD|3919' );
1227float_is( $hit->significance, 0.064 );
1228is( $hit->bits,      28.3 );
1229is( $hit->raw_score, 63 );
1230$hsp = $hit->next_hsp;
1231is( $hsp->query->start, 599 );
1232is( $hsp->query->end,   655 );
1233is( $hsp->hit->start,   23 );
1234is( $hsp->hit->end,     76 );
1235
1236# Test PSI-BLAST parsing
1237
1238$searchio = Bio::SearchIO->new(
1239    '-format' => 'blast',
1240    '-file'   => test_input_file('psiblastreport.out')
1241);
1242
1243$result = $searchio->next_result;
1244like($result->algorithm_reference, qr/Gapped BLAST and PSI-BLAST/);
1245is( $result->database_name,    '/home/peter/blast/data/swissprot.pr' );
1246is( $result->database_entries, 88780 );
1247is( $result->database_letters, 31984247 );
1248
1249is( $result->algorithm, 'BLASTP' );
1250like( $result->algorithm_version, qr/^2\.0\.14/ );
1251is( $result->query_name,             'CYS1_DICDI' );
1252is( $result->query_length,           343 );
1253is( $result->get_statistic('kappa'), 0.0491 );
1254cmp_ok( $result->get_statistic('lambda'),  '==', 0.270 );
1255cmp_ok( $result->get_statistic('entropy'), '==', 0.230 );
1256is( $result->get_statistic('dbletters'),           31984247 );
1257is( $result->get_statistic('dbentries'),           88780 );
1258is( $result->get_statistic('effective_hsplength'), 49 );
1259is( $result->get_statistic('querylength'),         294 );
1260is( $result->get_statistic('effectivedblength'),   27634027 );
1261is( $result->get_statistic('effectivespace'),      8124403938 );
1262is( $result->get_statistic('effectivespaceused'),  8124403938 );
1263is( $result->get_parameter('matrix'),              'BLOSUM62' );
1264is( $result->get_parameter('gapopen'),             11 );
1265is( $result->get_parameter('gapext'),              1 );
1266
1267my @valid_hit_data = (
1268    [ 'sp|P04988|CYS1_DICDI', 343, 'P04988', '0',     721 ],
1269    [ 'sp|P43295|A494_ARATH', 313, 'P43295', '1e-75', 281 ],
1270    [ 'sp|P25804|CYSP_PEA',   363, 'P25804', '1e-74', 278 ]
1271);
1272my @valid_iter_data = (
1273    [ 127, 127, 0,   109, 18, 0, 0,   0, 0 ],
1274    [ 157, 40,  117, 2,   38, 0, 109, 3, 5 ]
1275);
1276my $iter_count = 0;
1277
1278while ( $iter = $result->next_iteration ) {
1279    $iter_count++;
1280    my $di = shift @valid_iter_data;
1281    is( $iter->number, $iter_count );
1282
1283    is( $iter->num_hits,                                shift @$di );
1284    is( $iter->num_hits_new,                            shift @$di );
1285    is( $iter->num_hits_old,                            shift @$di );
1286    is( scalar( $iter->newhits_below_threshold ),       shift @$di );
1287    is( scalar( $iter->newhits_not_below_threshold ),   shift @$di );
1288    is( scalar( $iter->newhits_unclassified ),          shift @$di );
1289    is( scalar( $iter->oldhits_below_threshold ),       shift @$di );
1290    is( scalar( $iter->oldhits_newly_below_threshold ), shift @$di );
1291    is( scalar( $iter->oldhits_not_below_threshold ),   shift @$di );
1292
1293    my $hit_count = 0;
1294    if ( $iter_count == 1 ) {
1295        while ( $hit = $result->next_hit ) {
1296            my $d = shift @valid_hit_data;
1297
1298            is( $hit->name,      shift @$d );
1299            is( $hit->length,    shift @$d );
1300            is( $hit->accession, shift @$d );
1301            float_is( $hit->significance, shift @$d );
1302            is( $hit->bits, shift @$d );
1303
1304            if ( $hit_count == 1 ) {
1305                my $hsps_left = 1;
1306                while ( my $hsp = $hit->next_hsp ) {
1307                    is( $hsp->query->start,    32 );
1308                    is( $hsp->query->end,      340 );
1309                    is( $hsp->hit->start,      3 );
1310                    is( $hsp->hit->end,        307 );
1311                    is( $hsp->length('total'), 316 );
1312                    is( $hsp->start('hit'),    $hsp->hit->start );
1313                    is( $hsp->end('query'),    $hsp->query->end );
1314                    is( $hsp->strand('sbjct'), $hsp->subject->strand )
1315                      ;    # alias for hit
1316                    float_is( $hsp->evalue, 1e-75 );
1317                    is( $hsp->score, 712 );
1318                    is( $hsp->bits,  281 );
1319                    is( sprintf( "%.1f", $hsp->percent_identity ), 46.5 );
1320                    is( sprintf( "%.4f", $hsp->frac_identical('query') ),
1321                        0.4757 );
1322                    is( sprintf( "%.3f", $hsp->frac_identical('hit') ), 0.482 );
1323                    is( $hsp->gaps, 18 );
1324                    is( $hsp->n,    1 );
1325                    $hsps_left--;
1326                }
1327                is( $hsps_left, 0 );
1328            }
1329            last if ( $hit_count++ > @valid_hit_data );
1330        }
1331    }
1332}
1333is( @valid_hit_data,  0 );
1334is( @valid_iter_data, 0 );
1335
1336# Test filtering
1337
1338$searchio = Bio::SearchIO->new(
1339    '-format' => 'blast',
1340    '-file'   => test_input_file('ecolitst.bls'),
1341    '-signif' => 1e-100
1342);
1343
1344@valid = qw(gb|AAC73113.1|);
1345$r     = $searchio->next_result;
1346
1347while ( my $hit = $r->next_hit ) {
1348    is( $hit->name, shift @valid );
1349}
1350
1351$searchio = Bio::SearchIO->new(
1352    '-format' => 'blast',
1353    '-file'   => test_input_file('ecolitst.bls'),
1354    '-score'  => 100
1355);
1356
1357@valid = qw(gb|AAC73113.1| gb|AAC76922.1| gb|AAC76994.1|);
1358$r     = $searchio->next_result;
1359
1360while ( my $hit = $r->next_hit ) {
1361    is( $hit->name, shift @valid );
1362}
1363is( @valid, 0 );
1364
1365$searchio = Bio::SearchIO->new(
1366    '-format' => 'blast',
1367    '-file'   => test_input_file('ecolitst.bls'),
1368    '-bits'   => 200
1369);
1370
1371@valid = qw(gb|AAC73113.1| gb|AAC76922.1|);
1372$r     = $searchio->next_result;
1373
1374while ( my $hit = $r->next_hit ) {
1375    is( $hit->name, shift @valid );
1376}
1377is( @valid, 0 );
1378
1379my $filt_func = sub {
1380    my $hit = shift;
1381    $hit->frac_identical('query') >= 0.31;
1382};
1383
1384$searchio = Bio::SearchIO->new(
1385    '-format'     => 'blast',
1386    '-file'       => test_input_file('ecolitst.bls'),
1387    '-hit_filter' => $filt_func
1388);
1389
1390@valid = qw(gb|AAC73113.1| gb|AAC76994.1|);
1391$r     = $searchio->next_result;
1392
1393while ( my $hit = $r->next_hit ) {
1394    is( $hit->name, shift @valid );
1395}
1396is( @valid, 0 );
1397
1398# bl2seq parsing testing
1399
1400# this is blastp bl2seq
1401$searchio = Bio::SearchIO->new(
1402    -format => 'blast',
1403    -file   => test_input_file('bl2seq.out')
1404);
1405$result = $searchio->next_result;
1406isa_ok( $result, 'Bio::Search::Result::ResultI' );
1407is( $result->query_name,                          '' );
1408is( $result->algorithm,                           'BLASTP' );
1409is( $result->algorithm_reference,                 undef );
1410is( $result->get_statistic('querylength'),        320 );
1411is( $result->get_statistic('effectivedblength'),  339 );
1412is( $result->get_statistic('effectivespace'),     108480 );
1413is( $result->get_statistic('effectivespaceused'), 108480 );
1414$hit = $result->next_hit;
1415is( $hit->name,   'ALEU_HORVU' );
1416is( $hit->length, 362 );
1417$hsp = $hit->next_hsp;
1418is( $hsp->score,                481 );
1419is( $hsp->bits,                 191 );
1420is( int $hsp->percent_identity, 34 );
1421float_is( $hsp->evalue, 2e-53 );
1422is( int( $hsp->frac_conserved * $hsp->length ), 167 );
1423is( $hsp->query->start,                         28 );
1424is( $hsp->query->end,                           343 );
1425is( $hsp->hit->start,                           60 );
1426is( $hsp->hit->end,                             360 );
1427is( $hsp->gaps,                                 27 );
1428
1429# this is blastn bl2seq
1430$searchio = Bio::SearchIO->new(
1431    -format => 'blast',
1432    -file   => test_input_file('bl2seq.blastn.rev')
1433);
1434$result = $searchio->next_result;
1435isa_ok( $result, 'Bio::Search::Result::ResultI' );
1436is( $result->query_name,                          '' );
1437is( $result->algorithm,                           'BLASTN' );
1438is( $result->algorithm_reference,                 undef );
1439is( $result->query_length,                        180 );
1440is( $result->get_statistic('querylength'),        174 );
1441is( $result->get_statistic('effectivedblength'),  173 );
1442is( $result->get_statistic('effectivespace'),     30102 );
1443is( $result->get_statistic('effectivespaceused'), 30102 );
1444$hit = $result->next_hit;
1445is( $hit->length, 179 );
1446is( $hit->name,   'human' );
1447$hsp = $hit->next_hsp;
1448is( $hsp->score,                27 );
1449is( $hsp->bits,                 '54.0' );
1450is( int $hsp->percent_identity, 88 );
1451float_is( $hsp->evalue, 2e-12 );
1452is( int( $hsp->frac_conserved * $hsp->length ), 83 );
1453is( $hsp->query->start,                         94 );
1454is( $hsp->query->end,                           180 );
1455is( $hsp->query->strand,                        1 );
1456is( $hsp->hit->strand,                          -1 );
1457is( $hsp->hit->start,                           1 );
1458is( $hsp->hit->end,                             94 );
1459is( $hsp->gaps,                                 7 );
1460is( $hsp->n,                                    1 );
1461
1462# this is blastn bl2seq
1463$searchio = Bio::SearchIO->new(
1464    -format => 'blast',
1465    -file   => test_input_file('bl2seq.blastn')
1466);
1467$result = $searchio->next_result;
1468isa_ok( $result, 'Bio::Search::Result::ResultI' );
1469is( $result->query_name,                          '' );
1470is( $result->query_length,                        180 );
1471is( $result->algorithm,                           'BLASTN' );
1472is( $result->algorithm_reference,                 undef );
1473is( $result->get_statistic('querylength'),        174 );
1474is( $result->get_statistic('effectivedblength'),  173 );
1475is( $result->get_statistic('effectivespace'),     30102 );
1476is( $result->get_statistic('effectivespaceused'), 30102 );
1477$hit = $result->next_hit;
1478is( $hit->name,   'human' );
1479is( $hit->length, 179 );
1480$hsp = $hit->next_hsp;
1481is( $hsp->score,                27 );
1482is( $hsp->bits,                 '54.0' );
1483is( int $hsp->percent_identity, 88 );
1484float_is( $hsp->evalue, 2e-12 );
1485is( int( $hsp->frac_conserved * $hsp->length ), 83 );
1486is( $hsp->query->start,                         94 );
1487is( $hsp->query->end,                           180 );
1488is( $hsp->query->strand,                        1 );
1489is( $hsp->hit->strand,                          1 );
1490is( $hsp->hit->start,                           86 );
1491is( $hsp->hit->end,                             179 );
1492is( $hsp->gaps,                                 7 );
1493is( $hsp->n,                                    1 );
1494
1495# this is blastn bl2seq+
1496$searchio = Bio::SearchIO->new(
1497    -format => 'blast',
1498    -file   => test_input_file('bl2seq+.blastn')
1499);
1500$result = $searchio->next_result;
1501isa_ok( $result, 'Bio::Search::Result::ResultI' );
1502is( $result->query_name, 'gi|2695846|emb|Y13255.1|' );
1503is( $result->query_description,
1504   'Acipenser baeri mRNA for immunoglobulin heavy chain, clone ScH 3.3'
1505);
1506is( $result->query_length,                        606 );
1507is( $result->algorithm,                          'BLASTN' );
1508is( $result->algorithm_version,                  '2.2.29+' );
1509is( $result->algorithm_reference,                 undef );
1510is( $result->get_statistic('effectivespaceused'), 352836 );
1511is( $result->get_statistic('kappa'),              0.621 );
1512is( $result->get_statistic('kappa_gapped'),      '0.460' );
1513is( $result->get_statistic('lambda'),             1.33 );
1514is( $result->get_statistic('lambda_gapped'),      1.28 );
1515is( $result->get_statistic('entropy'),            1.12 );
1516is( $result->get_statistic('entropy_gapped'),    '0.850' );
1517$hit = $result->next_hit;
1518is( $hit->name,   'gi|2695846|emb|Y13255.1|' );
1519is( $hit->description,
1520   'Acipenser baeri mRNA for immunoglobulin heavy chain, clone ScH 3.3'
1521);
1522is( $hit->length, 606 );
1523$hsp = $hit->next_hsp;
1524is( $hsp->score,            606 );
1525is( $hsp->bits,             1120 );
1526is( $hsp->percent_identity, 100 );
1527float_is( $hsp->evalue,    '0.0' );
1528is( $hsp->query->start,     1 );
1529is( $hsp->query->end,       606 );
1530is( $hsp->query->strand,    1 );
1531is( $hsp->hit->strand,      1 );
1532is( $hsp->hit->start,       1 );
1533is( $hsp->hit->end,         606 );
1534is( $hsp->gaps,             0 );
1535is( $hsp->n,                1 );
1536
1537# this is blastp bl2seq
1538$searchio = Bio::SearchIO->new(
1539    -format => 'blast',
1540    -file   => test_input_file('bl2seq.bug940.out')
1541);
1542$result = $searchio->next_result;
1543isa_ok( $result, 'Bio::Search::Result::ResultI' );
1544is( $result->query_name, 'zinc' );
1545is( $result->algorithm,  'BLASTP' );
1546is( $result->query_description,
1547'finger protein 135 (clone pHZ-17) [Homo sapiens]. neo_id RS.ctg14243-000000.6.0'
1548);
1549is( $result->query_length,                        469 );
1550is( $result->get_statistic('querylength'),        446 );
1551is( $result->get_statistic('effectivedblength'),  446 );
1552is( $result->get_statistic('effectivespace'),     198916 );
1553is( $result->get_statistic('effectivespaceused'), 198916 );
1554$hit = $result->next_hit;
1555is( $hit->name,    'gi|4507985|' );
1556is( $hit->ncbi_gi, 4507985 );
1557is( $hit->description,
1558'zinc finger protein 135 (clone pHZ-17) [Homo sapiens]. neo_id RS.ctg14243-000000.6.0'
1559);
1560is( $hit->length, 469 );
1561$hsp = $hit->next_hsp;
1562is( $hsp->score,                1626 );
1563is( $hsp->bits,                 637 );
1564is( int $hsp->percent_identity, 66 );
1565float_is( $hsp->evalue, 0.0 );
1566is( int( $hsp->frac_conserved * $hsp->length ), 330 );
1567is( $hsp->query->start,                         121 );
1568is( $hsp->query->end,                           469 );
1569is( $hsp->hit->start,                           1 );
1570is( $hsp->hit->end,                             469 );
1571is( $hsp->gaps,                                 120 );
1572is( $hsp->n,                                    1 );
1573
1574ok( $hit->next_hsp );    # there is more than one HSP here,
1575                         # make sure it is parsed at least
1576
1577# cannot distinguish between blastx and tblastn reports
1578# so we're only testing a blastx report for now
1579
1580# this is blastx bl2seq
1581$searchio = Bio::SearchIO->new(
1582    -format => 'blast',
1583    -file   => test_input_file('bl2seq.blastx.out')
1584);
1585$result = $searchio->next_result;
1586isa_ok( $result, 'Bio::Search::Result::ResultI' );
1587is( $result->query_name, 'AE000111.1' );
1588is( $result->query_description,
1589    'Escherichia coli K-12 MG1655 section 1 of 400 of the complete genome' );
1590is( $result->algorithm,                           'BLASTX' );
1591is( $result->algorithm_reference,                 undef );
1592is( $result->query_length,                        720 );
1593is( $result->get_statistic('querylength'),        undef );
1594is( $result->get_statistic('effectivedblength'),  787 );
1595is( $result->get_statistic('effectivespace'),     undef );
1596is( $result->get_statistic('effectivespaceused'), 162122 );
1597$hit = $result->next_hit;
1598is( $hit->name, 'AK1H_ECOLI' );
1599is( $hit->description,
1600'P00561 Bifunctional aspartokinase/homoserine dehydrogenase I (AKI-HDI) [Includes: Aspartokinase I ; Homoserine dehydrogenase I ]'
1601);
1602is( $hit->length, 820 );
1603$hsp = $hit->next_hsp;
1604is( $hsp->score,                634 );
1605is( $hsp->bits,                 248 );
1606is( int $hsp->percent_identity, 100 );
1607float_is( $hsp->evalue, 2e-70 );
1608is( int( $hsp->frac_conserved * $hsp->length ), 128 );
1609is( $hsp->query->start,                         1 );
1610is( $hsp->query->end,                           384 );
1611is( $hsp->hit->start,                           1 );
1612is( $hsp->hit->end,                             128 );
1613is( $hsp->gaps,                                 0 );
1614is( $hsp->query->frame,                         0 );
1615is( $hsp->hit->frame,                           0 );
1616is( $hsp->query->strand,                        -1 );
1617is( $hsp->hit->strand,                          0 );
1618is( $hsp->n,                                    1 );
1619
1620# this is tblastx bl2seq (self against self)
1621$searchio = Bio::SearchIO->new(
1622    -format => 'blast',
1623    -file   => test_input_file('bl2seq.tblastx.out')
1624);
1625$result = $searchio->next_result;
1626isa_ok( $result, 'Bio::Search::Result::ResultI' );
1627is( $result->query_name,          'Escherichia' );
1628is( $result->algorithm,           'TBLASTX' );
1629is( $result->algorithm_reference, undef );
1630is( $result->query_description,
1631    'coli K-12 MG1655 section 1 of 400 of the complete genome' );
1632is( $result->query_length,                        720 );
1633is( $result->get_statistic('querylength'),        undef );
1634is( $result->get_statistic('effectivedblength'),  221 );
1635is( $result->get_statistic('effectivespace'),     undef );
1636is( $result->get_statistic('effectivespaceused'), 48620 );
1637$hit = $result->next_hit;
1638is( $hit->name,    'gi|1786181|gb|AE000111.1|AE000111' );
1639is( $hit->ncbi_gi, 1786181 );
1640is( $hit->description,
1641    'Escherichia coli K-12 MG1655 section 1 of 400 of the complete genome' );
1642is( $hit->length, 720 );
1643$hsp = $hit->next_hsp;
1644is( $hsp->score,                1118 );
1645is( $hsp->bits,                 515 );
1646is( int $hsp->percent_identity, 95 );
1647float_is( $hsp->evalue, 1e-151 );
1648is( int( $hsp->frac_conserved * $hsp->length ), 229 );
1649is( $hsp->query->start,                         1 );
1650is( $hsp->query->end,                           720 );
1651is( $hsp->hit->start,                           1 );
1652is( $hsp->hit->end,                             720 );
1653is( $hsp->gaps,                                 0 );
1654is( $hsp->query->frame,                         0 );
1655is( $hsp->hit->frame,                           0 );
1656is( $hsp->query->strand,                        1 );
1657is( $hsp->hit->strand,                          1 );
1658is( $hsp->n,                                    1 );
1659
1660# this is NCBI tblastn
1661$searchio = Bio::SearchIO->new(
1662    -format => 'blast',
1663    -file   => test_input_file('tblastn.out')
1664);
1665$result = $searchio->next_result;
1666isa_ok( $result, 'Bio::Search::Result::ResultI' );
1667is( $result->algorithm, 'TBLASTN' );
1668like($result->algorithm_reference,qr/Gapped BLAST and PSI-BLAST/);
1669is( $result->get_statistic('querylength'),        102 );
1670is( $result->get_statistic('effectivedblength'),  4342 );
1671is( $result->get_statistic('effectivespace'),     442884 );
1672is( $result->get_statistic('effectivespaceused'), 442884 );
1673$hit = $result->next_hit;
1674is( $hit->name, 'gi|10040111|emb|AL390796.6|AL390796' );
1675
1676# Test Blast parsing with B=0 (WU-BLAST)
1677$searchio = Bio::SearchIO->new(
1678    -file   => test_input_file('no_hsps.blastp'),
1679    -format => 'blast'
1680);
1681$result = $searchio->next_result;
1682like($result->algorithm_reference,qr/Gish, W. \(1996-2003\)/);
1683is( $result->query_name, 'mgri:MG00189.3' );
1684$hit = $result->next_hit;
1685is( $hit->name,        'mgri:MG00189.3' );
1686is( $hit->description, 'hypothetical protein 6892 8867 +' );
1687is( $hit->bits,        3098 );
1688float_is( $hit->significance, 0. );
1689
1690$hit = $result->next_hit;
1691is( $hit->name,        'fgram:FG01141.1' );
1692is( $hit->description, 'hypothetical protein 47007 48803 -' );
1693is( $hit->bits,        2182 );
1694float_is( $hit->significance, 4.2e-226 );
1695is( $result->num_hits, 415 );
1696
1697# Let's now test if _guess_format is doing its job correctly
1698my %pair = (
1699    'filename.blast' => 'blast',
1700    'filename.bls'   => 'blast',
1701    'f.blx'          => 'blast',
1702    'f.tblx'         => 'blast',
1703    'fast.bls'       => 'blast',
1704    'f.fasta'        => 'fasta',
1705    'f.fa'           => 'fasta',
1706    'f.fx'           => 'fasta',
1707    'f.fy'           => 'fasta',
1708    'f.ssearch'      => 'fasta',
1709    'f.SSEARCH.m9'   => 'fasta',
1710    'f.m9'           => 'fasta',
1711    'f.psearch'      => 'fasta',
1712    'f.osearch'      => 'fasta',
1713    'f.exon'         => 'exonerate',
1714    'f.exonerate'    => 'exonerate',
1715    'f.blastxml'     => 'blastxml',
1716    'f.xml'          => 'blastxml'
1717);
1718while ( my ( $file, $expformat ) = each %pair ) {
1719    is( Bio::SearchIO->_guess_format($file),
1720        $expformat, "$expformat for $file" );
1721}
1722
1723# Test Wes Barris's reported bug when parsing blastcl3 output which
1724# has integer overflow
1725
1726$searchio = Bio::SearchIO->new(
1727    -file   => test_input_file('hsinsulin.blastcl3.blastn'),
1728    -format => 'blast'
1729);
1730$result = $searchio->next_result;
1731is( $result->query_name,         'human' );
1732is( $result->database_letters(), '-24016349' );
1733
1734# this is of course not the right length, but is the what blastcl3
1735# reports, the correct value is
1736is( $result->get_statistic('dbletters'), '192913178' );
1737is( $result->get_statistic('dbentries'), '1867771' );
1738
1739# test for links and groups being parsed out of WU-BLAST properly
1740$searchio = Bio::SearchIO->new(
1741    -format => 'blast',
1742    -file   => test_input_file('brassica_ATH.WUBLASTN')
1743);
1744ok( $result = $searchio->next_result );
1745ok( $hit    = $result->next_hit );
1746ok( $hsp    = $hit->next_hsp );
1747is( $hsp->links,         '(1)-3-2' );
1748is( $hsp->query->strand, 1 );
1749is( $hsp->hit->strand,   1 );
1750is( $hsp->hsp_group,     '1' );
1751is( $hsp->n,             1 );
1752## Web blast result parsing
1753
1754$searchio = Bio::SearchIO->new(
1755    -format => 'blast',
1756    -file   => test_input_file('catalase-webblast.BLASTP')
1757);
1758ok( $result = $searchio->next_result );
1759is( $result->rid, '1118324516-16598-103707467515.BLASTQ1' );
1760ok( $hit = $result->next_hit );
1761is( $hit->name,      'gi|40747822|gb|EAA66978.1|', 'full hit name' );
1762is( $hit->accession, 'EAA66978',                   'hit accession' );
1763is( $hit->ncbi_gi,   40747822 );
1764ok( $hsp = $hit->next_hsp );
1765is( $hsp->query->start, 1,   'query start' );
1766is( $hsp->query->end,   528, 'query start' );
1767is( $hsp->n,            1 );
1768
1769# tests for new BLAST 2.2.13 output
1770$searchio = Bio::SearchIO->new(
1771    -format => 'blast',
1772    -file   => test_input_file('new_blastn.txt')
1773);
1774
1775$result = $searchio->next_result;
1776is( $result->database_name,
1777'All GenBank+EMBL+DDBJ+PDB sequences (but no EST, STS,GSS,environmental samples or phase 0, 1 or 2 HTGS sequences)'
1778);
1779is( $result->database_entries,  3742891 );
1780is( $result->database_letters,  16670205594 );
1781is( $result->algorithm,         'BLASTN' );
1782is( $result->algorithm_version, '2.2.13 [Nov-27-2005]' );
1783like($result->algorithm_reference, qr/Gapped BLAST and PSI-BLAST/);
1784is( $result->rid,                    '1141079027-8324-8848328247.BLASTQ4' );
1785is( $result->query_name,             'pyrR,' );
1786is( $result->query_length,           558 );
1787is( $result->get_statistic('kappa'), '0.711' );
1788is( $result->get_statistic('kappa_gapped'),        '0.711' );
1789is( $result->get_statistic('lambda'),              '1.37' );
1790is( $result->get_statistic('lambda_gapped'),       '1.37' );
1791is( $result->get_statistic('entropy'),             '1.31' );
1792is( $result->get_statistic('entropy_gapped'),      '1.31' );
1793is( $result->get_statistic('dbletters'),           '-509663586' );
1794is( $result->get_statistic('dbentries'),           3742891 );
1795is( $result->get_statistic('effective_hsplength'), undef );
1796is( $result->get_statistic('effectivespace'),      8935230198384 );
1797is(
1798    $result->get_statistic(
1799        'number_of_hsps_better_than_expect_value_cutoff_without_gapping'),
1800    0
1801);
1802is( $result->get_statistic('number_of_hsps_gapped'),              1771 );
1803is( $result->get_statistic('number_of_hsps_successfully_gapped'), 0 );
1804is( $result->get_statistic('length_adjustment'),                  22 );
1805is( $result->get_statistic('querylength'),                        536 );
1806is( $result->get_statistic('effectivedblength'),                  16670205594 );
1807is( $result->get_statistic('effectivespaceused'), 8891094027712 );
1808is( $result->get_parameter('matrix'),             'blastn matrix:1 -3' );
1809is( $result->get_parameter('gapopen'),            5 );
1810is( $result->get_parameter('gapext'),             2 );
1811is( $result->get_statistic('S2'),                 '60' );
1812is( $result->get_statistic('S2_bits'),            '119.4' );
1813float_is( $result->get_parameter('expect'), '1e-23' );
1814is( $result->get_statistic('num_extensions'), '117843' );
1815
1816@valid = (
1817    [
1818        'gi|41400296|gb|AE016958.1|', 4829781, 'AE016958', 41400296, '6e-059',
1819        119, 236
1820    ],
1821    [
1822        'gi|54013472|dbj|AP006618.1|', 6021225, 'AP006618', 54013472, '4e-026',
1823        64, 127
1824    ],
1825    [
1826        'gi|57546753|dbj|BA000030.2|', 9025608, 'BA000030', 57546753, '1e-023',
1827        60, 119
1828    ]
1829);
1830$count = 0;
1831
1832while ( $hit = $result->next_hit ) {
1833    my $d = shift @valid;
1834
1835    is( $hit->name,      shift @$d );
1836    is( $hit->length,    shift @$d );
1837    is( $hit->accession, shift @$d );
1838    is( $hit->ncbi_gi,   shift @$d );
1839    float_is( $hit->significance, shift @$d );
1840    is( $hit->raw_score, shift @$d );
1841    is( $hit->bits,      shift @$d );
1842
1843    if ( $count == 0 ) {
1844        my $hsps_left = 1;
1845        while ( my $hsp = $hit->next_hsp ) {
1846            is( $hsp->query->start,    262 );
1847            is( $hsp->query->end,      552 );
1848            is( $hsp->hit->start,      1166897 );
1849            is( $hsp->hit->end,        1167187 );
1850            is( $hsp->length('total'), 291 );
1851            is( $hsp->hit_features,    'PyrR' );
1852            is( $hsp->start('hit'),    $hsp->hit->start );
1853            is( $hsp->end('query'),    $hsp->query->end );
1854            is( $hsp->strand('sbjct'), $hsp->subject->strand );  # alias for hit
1855            float_is( $hsp->evalue, 6e-59 );
1856            is( $hsp->score, 119 );
1857            is( $hsp->bits,  236 );
1858            is( sprintf( "%.2f", $hsp->percent_identity ),        85.22 );
1859            is( sprintf( "%.4f", $hsp->frac_identical('query') ), 0.8522 );
1860            is( sprintf( "%.4f", $hsp->frac_identical('hit') ),   0.8522 );
1861            is( $hsp->gaps, 0 );
1862            is( $hsp->n,    1 );
1863            $hsps_left--;
1864        }
1865        is( $hsps_left, 0 );
1866    }
1867    last if ( $count++ > @valid );
1868}
1869is( @valid, 0 );
1870
1871# Bug 2189
1872$searchio = Bio::SearchIO->new(
1873    -format => 'blast',
1874    -file   => test_input_file('blastp2215.blast')
1875);
1876
1877$result = $searchio->next_result;
1878is( $result->database_entries,  4460989 );
1879is( $result->database_letters,  1533424333 );
1880is( $result->algorithm,         'BLASTP' );
1881is( $result->algorithm_version, '2.2.15 [Oct-15-2006]' );
1882is( $result->rid,               '1169055516-21385-22799250964.BLASTQ4' );
1883is( $result->query_name,        'gi|15608519|ref|NP_215895.1|' );
1884is( $result->query_gi,          15608519 );
1885is( $result->query_length,      193 );
1886@hits = $result->hits;
1887is( scalar(@hits),          10 );
1888is( $hits[1]->accession,    '1W30' );
1889is( $hits[4]->significance, '2e-72' );
1890is( $hits[7]->bits,         '254' );
1891$result = $searchio->next_result;
1892is( $result->database_entries,  4460989 );
1893is( $result->database_letters,  1533424333 );
1894is( $result->algorithm,         'BLASTP' );
1895is( $result->algorithm_version, '2.2.15 [Oct-15-2006]' );
1896is( $result->query_name,        'gi|15595598|ref|NP_249092.1|' );
1897is( $result->query_length,      423 );
1898@hits = $result->hits;
1899is( scalar(@hits),          10 );
1900is( $hits[1]->accession,    'ZP_00972546' );
1901is( $hits[2]->ncbi_gi,      116054132 );
1902is( $hits[4]->significance, '0.0' );
1903is( $hits[7]->bits,         624 );
1904
1905# Bug 2246
1906$searchio = Bio::SearchIO->new(
1907    -format  => 'blast',
1908    -verbose => -1,
1909    -file    => test_input_file('bug2246.blast')
1910);
1911$result = $searchio->next_result;
1912is(
1913    $result->get_statistic(
1914        'number_of_hsps_better_than_expect_value_cutoff_without_gapping'),
1915    0
1916);
1917is( $result->get_statistic('number_of_hsps_gapped'),              7049 );
1918is( $result->get_statistic('number_of_hsps_successfully_gapped'), 55 );
1919is( $result->get_statistic('length_adjustment'),                  125 );
1920is( $result->get_statistic('querylength'),                        68 );
1921is( $result->get_statistic('effectivedblength'),                  1045382588 );
1922is( $result->get_statistic('effectivespace'),                     71086015984 );
1923is( $result->get_statistic('effectivespaceused'),                 71086015984 );
1924$hit = $result->next_hit;
1925is $hit->name,        'UniRef50_Q9X0H5';
1926is $hit->length,      0;
1927is $hit->accession,   'UniRef50_Q9X0H5';
1928is $hit->description, 'Cluster: Histidyl-tRNA synthetase; n=4; Thermoto...';
1929is $hit->bits,        23;
1930float_is( $hit->significance, 650 );
1931
1932# Bug 1986
1933$searchio = Bio::SearchIO->new(
1934    -format  => 'blast',
1935    -verbose => -1,
1936    -file    => test_input_file('bug1986.blastp')
1937);
1938$result = $searchio->next_result;
1939is( $result->get_statistic('querylength'),        335 );
1940is( $result->get_statistic('effectivedblength'),  18683311 );
1941is( $result->get_statistic('effectivespace'),     6258909185 );
1942is( $result->get_statistic('effectivespaceused'), 6258909185 );
1943$hit = $result->next_hit;
1944is $hit->name,      'ENSP00000350182';
1945is $hit->length,    425;
1946is $hit->accession, 'ENSP00000350182';
1947is $hit->description,
1948'pep:novel clone::BX322644.8:4905:15090:-1 gene:ENSG00000137397 transcript:ENST00000357569';
1949is $hit->raw_score, 301;
1950is $hit->bits,      120;
1951float_is( $hit->significance, 3e-27 );
1952$hit = $result->next_hit;
1953is $hit->name,      'ENSP00000327738';
1954is $hit->length,    468;
1955is $hit->accession, 'ENSP00000327738';
1956is $hit->description,
1957'pep:known-ccds chromosome:NCBI36:4:189297592:189305643:1 gene:ENSG00000184108 transcript:ENST00000332517 CCDS3851.1';
1958is $hit->raw_score, 289;
1959is $hit->bits,      115;
1960float_is( $hit->significance, 8e-26 );
1961
1962# Bug 1986, pt. 2
1963
1964# handle at least the first iteration with BLAST searches using databases
1965# containing non-unique IDs
1966
1967my $file = test_input_file('bug1986.blast2');
1968my %unique_accs;
1969open my $IN, '<', $file or die "Could not read file '$file': $!\n";
1970
1971while (<$IN>) {
1972    last if (/^Sequences/);
1973}
1974$count = 1;
1975while (<$IN>) {
1976    chomp;
1977    next if m{^\s*$};
1978    next unless ($_);
1979    last if m{^>};
1980    my ($accession) = split(/\s+/);
1981
1982    #print "Real Hit $count = $accession\n";
1983    $unique_accs{$accession}++;
1984
1985    #last if ($count == 10);
1986    ++$count;
1987}
1988close $IN;
1989
1990is( $count,                      495 );
1991is( scalar( keys %unique_accs ), 490 );
1992
1993my %search_accs;
1994
1995$searchio = Bio::SearchIO->new(
1996    -format  => 'blast',
1997    -verbose => -1,
1998    -file    => $file
1999);
2000$result = $searchio->next_result;
2001$count  = 1;
2002while ( my $hit = $result->next_hit ) {
2003    $search_accs{ $hit->accession }++;
2004    $count++;
2005}
2006
2007is( $count,                      495 );
2008is( scalar( keys %search_accs ), 490 );
2009
2010is_deeply( \%unique_accs, \%search_accs );
2011
2012# bug 2391 - long query names
2013
2014$file = test_input_file('bug2391.megablast');
2015
2016$searchio = Bio::SearchIO->new(
2017    -format => 'blast',
2018    -file   => $file
2019);
2020$result = $searchio->next_result;
2021
2022# data is getting munged up with long names
2023is( $result->query_name,
2024    'c6_COX;c6_QBL;6|31508172;31503325;31478402|rs36223351|1|dbSNP|C/G' );
2025is( $result->query_description, '' );
2026is( $result->algorithm,         'MEGABLAST' );
2027is(
2028    $result->get_statistic(
2029        'number_of_hsps_better_than_expect_value_cutoff_without_gapping'),
2030    undef
2031);
2032is( $result->get_statistic('number_of_hsps_gapped'),              0 );
2033is( $result->get_statistic('number_of_hsps_successfully_gapped'), 0 );
2034is( $result->get_statistic('length_adjustment'),                  16 );
2035is( $result->get_statistic('querylength'),                        85 );
2036is( $result->get_statistic('effectivedblength'),                  59358266 );
2037is( $result->get_statistic('effectivespace'),                     5045452610 );
2038is( $result->get_statistic('effectivespaceused'),                 5045452610 );
2039
2040# bug 2399 - catching Expect(n) values
2041
2042$file = test_input_file('bug2399.tblastn');
2043
2044$searchio = Bio::SearchIO->new(
2045    -format => 'blast',
2046    -file   => $file
2047);
2048my $total_n = 0;
2049while ( my $query = $searchio->next_result ) {
2050    while ( my $subject = $query->next_hit ) {
2051        $total_n += grep { $_->n } $subject->hsps;
2052    }
2053}
2054is( $total_n, 80 );    # n = at least 1, so this was changed to reflect that
2055
2056sub cmp_evalue ($$) {
2057    my ( $tval, $aval ) = @_;
2058    is( sprintf( "%g", $tval ), sprintf( "%g", $aval ) );
2059}
2060
2061# bug 3064 - All-gap Query/Subject lines for BLAST+ do not have numbering
2062
2063$file = test_input_file('blast_plus.blastp');
2064
2065$searchio = Bio::SearchIO->new(
2066    -format => 'blast',
2067    -file   => $file
2068);
2069
2070my $total_hsps = 0;
2071while ( my $query = $searchio->next_result ) {
2072    is( $query->get_statistic('querylength'),        undef );
2073    is( $query->get_statistic('effectivedblength'),  undef );
2074    is( $query->get_statistic('effectivespace'),     undef );
2075    is( $query->get_statistic('effectivespaceused'), 55770 );
2076    while ( my $subject = $query->next_hit ) {
2077        while ( my $hsp = $subject->next_hsp ) {
2078            $total_hsps++;
2079            if ( $total_hsps == 1 ) {
2080                is( $hsp->start('query'),         5 );
2081                is( $hsp->start('hit'),           3 );
2082                is( $hsp->end('query'),           220 );
2083                is( $hsp->end('hit'),             308 );
2084                is( length( $hsp->query_string ), length( $hsp->hit_string ) );
2085            }
2086        }
2087    }
2088}
2089
2090is( $total_hsps, 2 );
2091
2092# BLAST 2.2.20+ output file ZABJ4EA7014.CH878695.1.blast.txt
2093# Tests SearchIO blast parsing of 'Features in/flanking this part of a subject sequence'
2094$searchio = Bio::SearchIO->new(
2095    -format => 'blast',
2096    -file   => test_input_file('ZABJ4EA7014.CH878695.1.blast.txt')
2097);
2098
2099$result = $searchio->next_result;
2100
2101# Parse BLAST header details
2102is( $result->algorithm,         'BLASTN' );
2103is( $result->algorithm_version, '2.2.20+' );
2104like($result->algorithm_reference, qr/A greedy algorithm for aligning DNA\s+sequences/);
2105is( $result->database_name,
2106    'human build 35 genome database (reference assembly only)' );
2107is( $result->database_entries, 378 );
2108is( $result->database_letters, 2866055344 );
2109is( $result->query_name,       'gi|95131563|gb|CH878695.1|' );
2110is( $result->query_description,
2111    'Homo sapiens 211000035829648 genomic scaffold' );
2112is( $result->query_length, 29324 );
2113
2114# Parse BLAST footer details
2115is( $result->get_statistic('posted_date'),    'Jul 26, 2007  3:20 PM' );
2116is( $result->get_statistic('dbletters'),      -1428911948 );
2117is( $result->get_statistic('lambda'),         '1.33' );
2118is( $result->get_statistic('kappa'),          '0.621' );
2119is( $result->get_statistic('entropy'),        '1.12' );
2120is( $result->get_statistic('lambda_gapped'),  '1.28' );
2121is( $result->get_statistic('kappa_gapped'),   '0.460' );
2122is( $result->get_statistic('entropy_gapped'), '0.850' );
2123is( $result->get_parameter('matrix'),         'blastn matrix:1 -2' );
2124is( $result->get_parameter('gapopen'),        0 );
2125is( $result->get_parameter('gapext'),         0 );
2126is( $result->get_statistic('num_extensions'), 216 );
2127is( $result->get_statistic('num_successful_extensions'), 216 );
2128is( $result->get_parameter('expect'),                    '0.01' );
2129is( $result->get_statistic('seqs_better_than_cutoff'),   10 );
2130is(
2131    $result->get_statistic(
2132        'number_of_hsps_better_than_expect_value_cutoff_without_gapping'),
2133    0
2134);
2135is( $result->get_statistic('number_of_hsps_gapped'),              216 );
2136is( $result->get_statistic('number_of_hsps_successfully_gapped'), 212 );
2137is( $result->get_statistic('length_adjustment'),                  34 );
2138is( $result->get_statistic('querylength'),                        29290 );
2139is( $result->get_statistic('effectivedblength'),                  2866042492 );
2140is( $result->get_statistic('effectivespace'),     83946384590680 );
2141is( $result->get_statistic('effectivespaceused'), 83946384590680 );
2142is( $result->get_statistic('A'),                  0 );
2143is( $result->get_statistic('X1'),                 23 );
2144is( $result->get_statistic('X1_bits'),            '44.2' );
2145is( $result->get_statistic('X2'),                 32 );
2146is( $result->get_statistic('X2_bits'),            '59.1' );
2147is( $result->get_statistic('X3'),                 54 );
2148is( $result->get_statistic('X3_bits'),            '99.7' );
2149is( $result->get_statistic('S1'),                 23 );
2150is( $result->get_statistic('S1_bits'),            '43.6' );
2151is( $result->get_statistic('S2'),                 29 );
2152is( $result->get_statistic('S2_bits'),            '54.7' );
2153
2154# Skip the 1st hit. It doesn't have any 'Features in/flanking this part of subject sequence:'
2155$hit = $result->next_hit;
2156
2157# The 2nd hit has hsps with 'Features flanking this part of subject sequence:'
2158$hit = $result->next_hit;
2159is( $hit->name,        'gi|51459264|ref|NT_077382.3|Hs1_77431' );
2160is( $hit->description, 'Homo sapiens chromosome 1 genomic contig' );
2161is( $hit->length,      237250 );
2162
2163# In the 2nd hit, look at the 1st hsp
2164$hsp = $hit->next_hsp;
2165is( $hsp->hit_features,
2166"16338 bp at 5' side: PRAME family member 8   11926 bp at 3' side: PRAME family member 9"
2167);
2168is( $hsp->bits,          7286 );
2169is( $hsp->score,         3945 );
2170is( $hsp->expect,        '0.0' );
2171is( $hsp->hsp_length,    6145 );
2172is( $hsp->num_identical, 5437 );
2173is( int sprintf( "%.2f", $hsp->percent_identity ), 88 );
2174is( $hsp->gaps,           152 );
2175is( $hsp->start('query'), 23225 );
2176is( $hsp->start('sbjct'), 86128 );
2177is( $hsp->end('query'),   29324 );
2178is( $hsp->end('sbjct'),   92165 );
2179
2180# In the 2nd hit, look at the 2nd hsp
2181$hsp = $hit->next_hsp;
2182is( $hsp->hit_features,
2183"25773 bp at 5' side: PRAME family member 3   3198 bp at 3' side: PRAME family member 5"
2184);
2185is( $hsp->bits,          4732 );
2186is( $hsp->score,         2562 );
2187is( $hsp->expect,        '0.0' );
2188is( $hsp->hsp_length,    4367 );
2189is( $hsp->num_identical, 3795 );
2190is( int sprintf( "%.2f", $hsp->percent_identity ), 86 );
2191is( $hsp->gaps,           178 );
2192is( $hsp->start('query'), 23894 );
2193is( $hsp->start('sbjct'), 37526 );
2194is( $hsp->end('query'),   28193 );
2195is( $hsp->end('sbjct'),   41781 );
2196
2197# In the 2nd hit, look at the 3rd hsp
2198$hsp = $hit->next_hsp;
2199is( $hsp->hit_features,
2200"16338 bp at 5' side: PRAME family member 8   14600 bp at 3' side: PRAME family member 9"
2201);
2202is( $hsp->bits,          3825 );
2203is( $hsp->score,         2071 );
2204is( $hsp->expect,        '0.0' );
2205is( $hsp->hsp_length,    3406 );
2206is( $hsp->num_identical, 2976 );
2207is( int sprintf( "%.2f", $hsp->percent_identity ), 87 );
2208is( $hsp->gaps,           89 );
2209is( $hsp->start('query'), 14528 );
2210is( $hsp->start('sbjct'), 86128 );
2211is( $hsp->end('query'),   17886 );
2212is( $hsp->end('sbjct'),   89491 );
2213
2214# In the 2nd hit, look at the 4th hsp
2215$hsp = $hit->next_hsp;
2216is( $hsp->hit_features,
2217"29101 bp at 5' side: PRAME family member 8   2120 bp at 3' side: PRAME family member 9"
2218);
2219is( $hsp->bits,          3241 );
2220is( $hsp->score,         1755 );
2221is( $hsp->expect,        '0.0' );
2222is( $hsp->hsp_length,    3158 );
2223is( $hsp->num_identical, 2711 );
2224is( int sprintf( "%.2f", $hsp->percent_identity ), 85 );
2225is( $hsp->gaps,           123 );
2226is( $hsp->start('query'), 23894 );
2227is( $hsp->start('sbjct'), 98891 );
2228is( $hsp->end('query'),   27005 );
2229is( $hsp->end('sbjct'),   101971 );
2230
2231# In the 2nd hit, look at the 5th hsp
2232$hsp = $hit->next_hsp;
2233is( $hsp->hit_features,  "PRAME family member 13" );
2234is( $hsp->bits,          3142 );
2235is( $hsp->score,         1701 );
2236is( $hsp->expect,        '0.0' );
2237is( $hsp->hsp_length,    2507 );
2238is( $hsp->num_identical, 2249 );
2239is( int sprintf( "%.2f", $hsp->percent_identity ), 89 );
2240is( $hsp->gaps,           63 );
2241is( $hsp->start('query'), 3255 );
2242is( $hsp->start('sbjct'), 128516 );
2243is( $hsp->end('query'),   5720 );
2244is( $hsp->end('sbjct'),   131000 );
2245
2246# testing for Bug #3298
2247$searchio = Bio::SearchIO->new(
2248    '-format' => 'blast',
2249    '-file'   => test_input_file('multiresult_blastn+.bls')
2250);
2251
2252is ($searchio->next_result->algorithm_version, '2.2.25+', "testing Bug 3298");
2253is ($searchio->next_result->algorithm_version, '2.2.25+', "testing Bug 3298");
2254is ($searchio->next_result->algorithm_version, '2.2.25+', "testing Bug 3298");
2255
2256# testing for Bug #3251
2257$searchio = Bio::SearchIO->new(
2258    '-format' => 'blast',
2259    '-file'   => test_input_file('rpsblast_no_hits.bls')
2260);
2261
2262is ($searchio->next_result->database_name, 'CDD.v.2.13', "testing Bug 3251");
2263is ($searchio->next_result->database_name, 'CDD.v.2.13', "testing Bug 3251");
2264is ($searchio->next_result->database_name, 'CDD.v.2.13', "testing Bug 3251");
2265