1# -*-Perl-*- Test Harness script for Bioperl
2# $Id: SearchIO_blast_pull.t 11525 2007-06-27 10:16:38Z sendu $
3
4use strict;
5
6BEGIN {
7    use Bio::Root::Test;
8
9    test_begin(-tests => 289);
10
11	use_ok('Bio::SearchIO');
12}
13
14
15my $searchio = Bio::SearchIO->new(-format => 'blast_pull', -file => test_input_file('new_blastn.txt'));
16
17my $result = $searchio->next_result;
18is $result->database_name, 'All GenBank+EMBL+DDBJ+PDB sequences (but no EST, STS,GSS,environmental samples or phase 0, 1 or 2 HTGS sequences)';
19is $result->database_entries, 3742891;
20is $result->database_letters, 16670205594;
21is $result->algorithm, 'BLASTN';
22is $result->algorithm_version, '2.2.13 [Nov-27-2005]';
23is $result->query_name, 'pyrR,';
24is $result->query_length, 558;
25is $result->get_statistic('kappa'), 0.711;
26is $result->get_statistic('kappa_gapped'), 0.711;
27is $result->get_statistic('lambda'), 1.37;
28is $result->get_statistic('lambda_gapped'), 1.37;
29is $result->get_statistic('entropy'), 1.31;
30is $result->get_statistic('entropy_gapped'), 1.31;
31is $result->get_statistic('dbletters'), -509663586;
32is $result->get_statistic('dbentries'), 3742891;
33is $result->get_statistic('effectivespace'), 8935230198384;
34is $result->get_parameter('matrix'), 'blastn matrix:1 -3';
35is $result->get_parameter('gapopen'), 5;
36is $result->get_parameter('gapext'), 2;
37is $result->get_statistic('S2'), 60;
38is $result->get_statistic('S2_bits'), 119.4;
39float_is $result->get_parameter('expect'), 1e-23;
40is $result->get_statistic('num_extensions'), 117843;
41
42my @valid = ( [ 'gi|41400296|gb|AE016958.1|', 4829781, 'AE016958', '6e-059', 236],
43	      [ 'gi|54013472|dbj|AP006618.1|', 6021225, 'AP006618', '4e-026', 127],
44	      [ 'gi|57546753|dbj|BA000030.2|', 9025608, 'BA000030', '1e-023', 119]);
45my $count = 0;
46while (my $hit = $result->next_hit ) {
47    my $d = shift @valid;
48
49    is $hit->name, shift @$d;
50    is $hit->length, shift @$d;
51    is $hit->accession, shift @$d;
52    float_is($hit->significance, shift @$d);
53    is $hit->raw_score, shift @$d;
54
55    if( $count == 0 ) {
56        my $hsps_left = 1;
57        while( my $hsp = $hit->next_hsp ) {
58            is $hsp->query->start, 262;
59            is $hsp->query->end, 552;
60            is $hsp->hit->start, 1166897;
61            is $hsp->hit->end, 1167187;
62            is $hsp->length('hsp'), 291;
63            is $hsp->start('hit'), $hsp->hit->start;
64            is $hsp->end('query'), $hsp->query->end;
65            is $hsp->strand('sbjct'), $hsp->subject->strand;# alias for hit
66            float_is($hsp->evalue, 6e-59);
67            is $hsp->score, 119;
68            is $hsp->bits,236;
69            is sprintf("%.1f",$hsp->percent_identity), 85.2;
70            is sprintf("%.3f",$hsp->frac_identical('query')), 0.852;
71            is sprintf("%.3f",$hsp->frac_identical('hit')), 0.852;
72            is $hsp->gaps, 0;
73            $hsps_left--;
74        }
75        is $hsps_left, 0;
76    }
77    last if( $count++ > @valid );
78}
79is @valid, 0;
80
81# descriptionless hit
82$searchio = Bio::SearchIO->new('-format' => 'blast_pull',
83							   '-file' => test_input_file('blast_no_hit_desc.txt'));
84
85$result = $searchio->next_result;
86my $hit = $result->next_hit;
87is $hit->name, 'chr1';
88is $hit->description, '';
89
90# further (NCBI blastn/p) tests taken from SearchIO.t
91$searchio = Bio::SearchIO->new('-format' => 'blast_pull',
92							   '-file' => test_input_file('ecolitst.bls'));
93
94$result = $searchio->next_result;
95
96is($result->database_name, 'ecoli.aa', 'database_name()');
97is($result->database_entries, 4289);
98is($result->database_letters, 1358990);
99
100is($result->algorithm, 'BLASTP');
101like($result->algorithm_version, qr/^2\.1\.3/);
102like($result->query_name, qr/gi|1786183|gb|AAC73113.1| (AE000111) aspartokinase I,\s+homoserine dehydrogenase I [Escherichia coli]/);
103is($result->query_length, 820);
104is($result->get_statistic('kappa'), '0.135');
105is($result->get_statistic('kappa_gapped'), '0.0410');
106is($result->get_statistic('lambda'), '0.319');
107is($result->get_statistic('lambda_gapped'), '0.267');
108is($result->get_statistic('entropy'), '0.383');
109is($result->get_statistic('entropy_gapped'), '0.140');
110
111is($result->get_statistic('dbletters'), 1358990);
112is($result->get_statistic('dbentries'), 4289);
113is($result->get_statistic('effective_hsplength'), 47);
114is($result->get_statistic('effectivespace'), 894675611);
115is($result->get_parameter('matrix'), 'BLOSUM62');
116is($result->get_parameter('gapopen'), 11);
117is($result->get_parameter('gapext'), 1);
118is($result->get_statistic('S2'), '92');
119is($result->get_statistic('S2_bits'), '40.0');
120float_is($result->get_parameter('expect'), '1.0e-03');
121is($result->get_statistic('num_extensions'), '82424');
122
123
124@valid = ( [ 'gb|AAC73113.1|', 820, 'AAC73113', '0', 1567],
125	      [ 'gb|AAC76922.1|', 810, 'AAC76922', '1e-91', 332],
126	      [ 'gb|AAC76994.1|', 449, 'AAC76994', '3e-47', 184]);
127$count = 0;
128while (my $hit = $result->next_hit) {
129    my $d = shift @valid;
130
131    is($hit->name, shift @$d);
132    is($hit->length, shift @$d);
133    is($hit->accession, shift @$d);
134    float_is($hit->significance, shift @$d);
135    is($hit->raw_score, shift @$d );
136
137    if( $count == 0 ) {
138        my $hsps_left = 1;
139        while (my $hsp = $hit->next_hsp) {
140            is($hsp->query->start, 1);
141            is($hsp->query->end, 820);
142            is($hsp->hit->start, 1);
143            is($hsp->hit->end, 820);
144            is($hsp->length('hsp'), 820);
145            is($hsp->start('hit'), $hsp->hit->start);
146            is($hsp->end('query'), $hsp->query->end);
147            is($hsp->strand('sbjct'), $hsp->subject->strand);# alias for hit
148            float_is($hsp->evalue, 0.0);
149            is($hsp->score, 4058);
150            is($hsp->bits,1567);
151            is(sprintf("%.2f",$hsp->percent_identity), 98.29);
152            is(sprintf("%.4f",$hsp->frac_identical('query')), 0.9829);
153            is(sprintf("%.4f",$hsp->frac_identical('hit')), 0.9829);
154            is($hsp->gaps, 0);
155            $hsps_left--;
156        }
157        is($hsps_left, 0);
158    }
159    last if( $count++ > @valid );
160}
161is(@valid, 0);
162
163$searchio = Bio::SearchIO->new('-format' => 'blast_pull',
164							  '-file'   => test_input_file('a_thaliana.blastn'));
165
166$result = $searchio->next_result;
167is($result->database_name, 'All GenBank+EMBL+DDBJ+PDB sequences (but no EST, STS, GSS,or phase 0, 1 or 2 HTGS sequences)');
168is($result->database_letters, 4677375331);
169is($result->database_entries, 1083200);
170is($result->algorithm, 'BLASTN');
171like($result->algorithm_version, qr/^2\.2\.1/);
172is($result->query_name, '');
173is($result->query_length, 60);
174is($result->get_parameter('gapopen'), 5);
175is($result->get_parameter('gapext'), 2);
176is($result->get_parameter('ktup'), undef);
177
178is($result->get_statistic('lambda'), 1.37);
179is($result->get_statistic('kappa'), 0.711);
180is($result->get_statistic('entropy'),1.31 );
181is($result->get_statistic('T'), 0);
182is($result->get_statistic('A'), 30);
183is($result->get_statistic('X1'), '6');
184is($result->get_statistic('X1_bits'), 11.9);
185is($result->get_statistic('X2'), 15);
186is($result->get_statistic('X2_bits'), 29.7);
187is($result->get_statistic('S1'), 12);
188is($result->get_statistic('S1_bits'), 24.3);
189is($result->get_statistic('S2'), 17);
190is($result->get_statistic('S2_bits'), 34.2);
191
192is($result->get_statistic('dbentries'), 1083200);
193
194@valid = ( [ 'gb|AY052359.1|', 2826, 'AY052359', '3e-18', 96, 1, 60,
195	     '1.0000'],
196	   [ 'gb|AC002329.2|AC002329', 76170, 'AC002329', '3e-18', 96, 1, 60,
197	     '1.0000' ],
198	   [ 'gb|AF132318.1|AF132318', 5383, 'AF132318', '0.04', 42, 35, 55,
199	     '0.3500']);
200$count = 0;
201
202while( my $hit = $result->next_hit ) {
203    my $d = shift @valid;
204    is($hit->name, shift @$d);
205    is($hit->length, shift @$d);
206    is($hit->accession, shift @$d);
207    float_is($hit->significance, shift @$d);
208    is($hit->raw_score, shift @$d );
209    is($hit->start, shift @$d);
210    is($hit->end,shift @$d);
211    is(sprintf("%.4f",$hit->frac_aligned_query), shift @$d);
212    if( $count == 0 ) {
213        my $hsps_left = 1;
214        while( my $hsp = $hit->next_hsp ) {
215            is($hsp->query->start, 1);
216            is($hsp->query->end, 60);
217            is($hsp->query->strand, 1);
218            is($hsp->hit->start, 154);
219            is($hsp->hit->end, 212);
220            is($hsp->hit->strand, 1);
221            is($hsp->length('hsp'), 60);
222            float_is($hsp->evalue, 3e-18);
223            is($hsp->score, 48);
224            is($hsp->bits,95.6);
225            is(sprintf("%.2f",$hsp->percent_identity), 96.67);
226            is(sprintf("%.4f",$hsp->frac_identical('query')), 0.9667);
227            is(sprintf("%.4f",$hsp->frac_identical('hit')), 0.9831);
228            is($hsp->query->frame(), 0);
229            is($hsp->hit->frame(), 0);
230            is($hsp->query->seq_id, '');
231            is($hsp->hit->seq_id, 'gb|AY052359.1|');
232            is($hsp->gaps('query'), 0);
233            is($hsp->gaps('hit'), 1);
234            is($hsp->gaps, 1);
235            is($hsp->query_string, 'aggaatgctgtttaattggaatcgtacaatggagaatttgacggaaatagaatcaacgat');
236            is($hsp->hit_string, 'aggaatgctgtttaattggaatca-acaatggagaatttgacggaaatagaatcaacgat');
237            is($hsp->homology_string, '|||||||||||||||||||||||  |||||||||||||||||||||||||||||||||||');
238            is(sprintf("%.2f",$hsp->get_aln->overall_percentage_identity), 96.67);
239            is(sprintf("%.2f",$hsp->get_aln->percentage_identity), 98.31);
240            $hsps_left--;
241        }
242        is($hsps_left, 0);
243    }
244    last if( $count++ > @valid );
245}
246is(@valid, 0);
247
248$searchio = Bio::SearchIO->new('-format' => 'blast_pull',
249							  '-file'   => test_input_file('frac_problems.blast'));
250my @expected = ("1.000", "0.943");
251while (my $result = $searchio->next_result) {
252    my $hit = $result->next_hit;
253	if (@expected == 2) {
254		is($hit->frac_identical, shift @expected);
255	}
256	else {
257		TODO: {
258			local $TODO = 'frac_identical failing!';
259			is($hit->frac_identical, shift @expected);
260		}
261	}
262}
263is(@expected, 0);
264
265# And even more: frac_aligned_query should never be over 1!
266$searchio = Bio::SearchIO->new('-format' => 'blast_pull',
267							  '-file'   => test_input_file('frac_problems2.blast'));
268$result = $searchio->next_result;
269$hit = $result->next_hit;
270is $hit->frac_aligned_query, 0.97;
271
272# Also, start and end should be sane
273$searchio = Bio::SearchIO->new('-format' => 'blast_pull',
274							  '-file'   => test_input_file('frac_problems3.blast'));
275$result = $searchio->next_result;
276$hit = $result->next_hit;
277is $hit->start('sbjct'), 207;
278is $hit->end('sbjct'), 1051;
279
280# Do a multiblast report test
281$searchio = Bio::SearchIO->new('-format' => 'blast_pull',
282							   '-file'   => test_input_file('multi_blast.bls'));
283
284@expected = qw(CATH_RAT CATL_HUMAN CATL_RAT PAPA_CARPA);
285my $results_left = 4;
286while( my $result = $searchio->next_result ) {
287    is($result->query_name, shift @expected, "Multiblast query test");
288    $results_left--;
289}
290is($results_left, 0);
291
292# Web blast result parsing
293$searchio = Bio::SearchIO->new(-format => 'blast_pull',
294							   -file   => test_input_file('catalase-webblast.BLASTP'));
295ok($result = $searchio->next_result);
296ok($hit = $result->next_hit);
297is($hit->name, 'gi|40747822|gb|EAA66978.1|', 'full hit name');
298is($hit->accession, 'EAA66978', 'hit accession');
299ok(my $hsp = $hit->next_hsp);
300is($hsp->query->start, 1, 'query start');
301is($hsp->query->end, 528, 'query start');
302
303# tests for new BLAST 2.2.13 output
304$searchio = Bio::SearchIO->new(-format => 'blast_pull',
305							  -file   => test_input_file('new_blastn.txt'));
306
307$result = $searchio->next_result;
308is($result->database_name, 'All GenBank+EMBL+DDBJ+PDB sequences (but no EST, STS,GSS,environmental samples or phase 0, 1 or 2 HTGS sequences)');
309is($result->database_entries, 3742891);
310is($result->database_letters, 16670205594);
311is($result->algorithm, 'BLASTN');
312is($result->algorithm_version, '2.2.13 [Nov-27-2005]');
313is($result->query_name, 'pyrR,');
314is($result->query_length, 558);
315is($result->get_statistic('kappa'), '0.711');
316is($result->get_statistic('kappa_gapped'), '0.711');
317is($result->get_statistic('lambda'), '1.37');
318is($result->get_statistic('lambda_gapped'), '1.37');
319is($result->get_statistic('entropy'), '1.31');
320is($result->get_statistic('entropy_gapped'), '1.31');
321is($result->get_statistic('dbletters'), '-509663586');
322is($result->get_statistic('dbentries'), 3742891);
323is($result->get_statistic('effective_hsplength'), undef);
324is($result->get_statistic('effectivespace'), 8935230198384);
325is($result->get_parameter('matrix'), 'blastn matrix:1 -3');
326is($result->get_parameter('gapopen'), 5);
327is($result->get_parameter('gapext'), 2);
328is($result->get_statistic('S2'), '60');
329is($result->get_statistic('S2_bits'), '119.4');
330float_is($result->get_parameter('expect'), '1e-23');
331is($result->get_statistic('num_extensions'), '117843');
332
333@valid = ( [ 'gi|41400296|gb|AE016958.1|', 4829781, 'AE016958', '6e-059', 236],
334	      [ 'gi|54013472|dbj|AP006618.1|', 6021225, 'AP006618', '4e-026', 127],
335	      [ 'gi|57546753|dbj|BA000030.2|', 9025608, 'BA000030', '1e-023', 119]);
336$count = 0;
337while( $hit = $result->next_hit ) {
338    my $d = shift @valid;
339
340    is($hit->name, shift @$d);
341    is($hit->length, shift @$d);
342    is($hit->accession, shift @$d);
343    float_is($hit->significance, shift @$d);
344    is($hit->raw_score, shift @$d );
345
346    if( $count == 0 ) {
347        my $hsps_left = 1;
348        while( my $hsp = $hit->next_hsp ) {
349            is($hsp->query->start, 262);
350            is($hsp->query->end, 552);
351            is($hsp->hit->start, 1166897);
352            is($hsp->hit->end, 1167187);
353            is($hsp->length('hsp'), 291);
354            is($hsp->start('hit'), $hsp->hit->start);
355            is($hsp->end('query'), $hsp->query->end);
356            is($hsp->strand('sbjct'), $hsp->subject->strand);# alias for hit
357            float_is($hsp->evalue, 6e-59);
358            is($hsp->score, 119);
359            is($hsp->bits,236);
360            is(sprintf("%.2f",$hsp->percent_identity), 85.22);
361            is(sprintf("%.4f",$hsp->frac_identical('query')), 0.8522);
362            is(sprintf("%.4f",$hsp->frac_identical('hit')), 0.8522);
363            is($hsp->gaps, 0);
364            $hsps_left--;
365        }
366        is($hsps_left, 0);
367    }
368    last if( $count++ > @valid );
369}
370is(@valid, 0);
371
372# Bug 2189
373$searchio = Bio::SearchIO->new(-format => 'blast_pull',
374							  -file   => test_input_file('blastp2215.blast'));
375
376$result = $searchio->next_result;
377is($result->database_entries, 4460989);
378is($result->database_letters, 1533424333);
379is($result->algorithm, 'BLASTP');
380is($result->algorithm_version, '2.2.15 [Oct-15-2006]');
381is($result->query_name, 'gi|15608519|ref|NP_215895.1|');
382is($result->query_length, 193);
383my @hits = $result->hits;
384is(scalar(@hits), 10);
385is($hits[1]->accession,'1W30');
386float_is($hits[4]->significance,'2e-72');
387is($hits[7]->score,'254');
388$result = $searchio->next_result;
389is($result->database_entries, 4460989);
390is($result->database_letters, 1533424333);
391is($result->algorithm, 'BLASTP');
392is($result->algorithm_version, '2.2.15 [Oct-15-2006]');
393is($result->query_name, 'gi|15595598|ref|NP_249092.1|');
394is($result->query_length, 423);
395@hits = $result->hits;
396is(scalar(@hits), 10);
397is($hits[1]->accession,'ZP_00972546');
398float_is($hits[4]->significance, '0.0');
399is($hits[7]->score, 624);
400
401# Bug 2246
402$searchio = Bio::SearchIO->new(-format => 'blast_pull',
403                               -verbose => -1,
404							  -file   => test_input_file('bug2246.blast'));
405$result = $searchio->next_result;
406$hit = $result->next_hit;
407is $hit->name, 'UniRef50_Q9X0H5';
408is $hit->length, undef;
409is $hit->accession, 'UniRef50_Q9X0H5';
410is $hit->description, 'Cluster: Histidyl-tRNA synthetase; n=4; Thermoto...';
411is $hit->raw_score, 23;
412float_is($hit->significance, 650);
413
414#*** need to /fully/ test a multi-result, multi-hit, multi-hsp report!
415