1# -*-Perl-*- Test Harness script for Bioperl 2# $Id: SearchIO_blast_pull.t 11525 2007-06-27 10:16:38Z sendu $ 3 4use strict; 5 6BEGIN { 7 use Bio::Root::Test; 8 9 test_begin(-tests => 289); 10 11 use_ok('Bio::SearchIO'); 12} 13 14 15my $searchio = Bio::SearchIO->new(-format => 'blast_pull', -file => test_input_file('new_blastn.txt')); 16 17my $result = $searchio->next_result; 18is $result->database_name, 'All GenBank+EMBL+DDBJ+PDB sequences (but no EST, STS,GSS,environmental samples or phase 0, 1 or 2 HTGS sequences)'; 19is $result->database_entries, 3742891; 20is $result->database_letters, 16670205594; 21is $result->algorithm, 'BLASTN'; 22is $result->algorithm_version, '2.2.13 [Nov-27-2005]'; 23is $result->query_name, 'pyrR,'; 24is $result->query_length, 558; 25is $result->get_statistic('kappa'), 0.711; 26is $result->get_statistic('kappa_gapped'), 0.711; 27is $result->get_statistic('lambda'), 1.37; 28is $result->get_statistic('lambda_gapped'), 1.37; 29is $result->get_statistic('entropy'), 1.31; 30is $result->get_statistic('entropy_gapped'), 1.31; 31is $result->get_statistic('dbletters'), -509663586; 32is $result->get_statistic('dbentries'), 3742891; 33is $result->get_statistic('effectivespace'), 8935230198384; 34is $result->get_parameter('matrix'), 'blastn matrix:1 -3'; 35is $result->get_parameter('gapopen'), 5; 36is $result->get_parameter('gapext'), 2; 37is $result->get_statistic('S2'), 60; 38is $result->get_statistic('S2_bits'), 119.4; 39float_is $result->get_parameter('expect'), 1e-23; 40is $result->get_statistic('num_extensions'), 117843; 41 42my @valid = ( [ 'gi|41400296|gb|AE016958.1|', 4829781, 'AE016958', '6e-059', 236], 43 [ 'gi|54013472|dbj|AP006618.1|', 6021225, 'AP006618', '4e-026', 127], 44 [ 'gi|57546753|dbj|BA000030.2|', 9025608, 'BA000030', '1e-023', 119]); 45my $count = 0; 46while (my $hit = $result->next_hit ) { 47 my $d = shift @valid; 48 49 is $hit->name, shift @$d; 50 is $hit->length, shift @$d; 51 is $hit->accession, shift @$d; 52 float_is($hit->significance, shift @$d); 53 is $hit->raw_score, shift @$d; 54 55 if( $count == 0 ) { 56 my $hsps_left = 1; 57 while( my $hsp = $hit->next_hsp ) { 58 is $hsp->query->start, 262; 59 is $hsp->query->end, 552; 60 is $hsp->hit->start, 1166897; 61 is $hsp->hit->end, 1167187; 62 is $hsp->length('hsp'), 291; 63 is $hsp->start('hit'), $hsp->hit->start; 64 is $hsp->end('query'), $hsp->query->end; 65 is $hsp->strand('sbjct'), $hsp->subject->strand;# alias for hit 66 float_is($hsp->evalue, 6e-59); 67 is $hsp->score, 119; 68 is $hsp->bits,236; 69 is sprintf("%.1f",$hsp->percent_identity), 85.2; 70 is sprintf("%.3f",$hsp->frac_identical('query')), 0.852; 71 is sprintf("%.3f",$hsp->frac_identical('hit')), 0.852; 72 is $hsp->gaps, 0; 73 $hsps_left--; 74 } 75 is $hsps_left, 0; 76 } 77 last if( $count++ > @valid ); 78} 79is @valid, 0; 80 81# descriptionless hit 82$searchio = Bio::SearchIO->new('-format' => 'blast_pull', 83 '-file' => test_input_file('blast_no_hit_desc.txt')); 84 85$result = $searchio->next_result; 86my $hit = $result->next_hit; 87is $hit->name, 'chr1'; 88is $hit->description, ''; 89 90# further (NCBI blastn/p) tests taken from SearchIO.t 91$searchio = Bio::SearchIO->new('-format' => 'blast_pull', 92 '-file' => test_input_file('ecolitst.bls')); 93 94$result = $searchio->next_result; 95 96is($result->database_name, 'ecoli.aa', 'database_name()'); 97is($result->database_entries, 4289); 98is($result->database_letters, 1358990); 99 100is($result->algorithm, 'BLASTP'); 101like($result->algorithm_version, qr/^2\.1\.3/); 102like($result->query_name, qr/gi|1786183|gb|AAC73113.1| (AE000111) aspartokinase I,\s+homoserine dehydrogenase I [Escherichia coli]/); 103is($result->query_length, 820); 104is($result->get_statistic('kappa'), '0.135'); 105is($result->get_statistic('kappa_gapped'), '0.0410'); 106is($result->get_statistic('lambda'), '0.319'); 107is($result->get_statistic('lambda_gapped'), '0.267'); 108is($result->get_statistic('entropy'), '0.383'); 109is($result->get_statistic('entropy_gapped'), '0.140'); 110 111is($result->get_statistic('dbletters'), 1358990); 112is($result->get_statistic('dbentries'), 4289); 113is($result->get_statistic('effective_hsplength'), 47); 114is($result->get_statistic('effectivespace'), 894675611); 115is($result->get_parameter('matrix'), 'BLOSUM62'); 116is($result->get_parameter('gapopen'), 11); 117is($result->get_parameter('gapext'), 1); 118is($result->get_statistic('S2'), '92'); 119is($result->get_statistic('S2_bits'), '40.0'); 120float_is($result->get_parameter('expect'), '1.0e-03'); 121is($result->get_statistic('num_extensions'), '82424'); 122 123 124@valid = ( [ 'gb|AAC73113.1|', 820, 'AAC73113', '0', 1567], 125 [ 'gb|AAC76922.1|', 810, 'AAC76922', '1e-91', 332], 126 [ 'gb|AAC76994.1|', 449, 'AAC76994', '3e-47', 184]); 127$count = 0; 128while (my $hit = $result->next_hit) { 129 my $d = shift @valid; 130 131 is($hit->name, shift @$d); 132 is($hit->length, shift @$d); 133 is($hit->accession, shift @$d); 134 float_is($hit->significance, shift @$d); 135 is($hit->raw_score, shift @$d ); 136 137 if( $count == 0 ) { 138 my $hsps_left = 1; 139 while (my $hsp = $hit->next_hsp) { 140 is($hsp->query->start, 1); 141 is($hsp->query->end, 820); 142 is($hsp->hit->start, 1); 143 is($hsp->hit->end, 820); 144 is($hsp->length('hsp'), 820); 145 is($hsp->start('hit'), $hsp->hit->start); 146 is($hsp->end('query'), $hsp->query->end); 147 is($hsp->strand('sbjct'), $hsp->subject->strand);# alias for hit 148 float_is($hsp->evalue, 0.0); 149 is($hsp->score, 4058); 150 is($hsp->bits,1567); 151 is(sprintf("%.2f",$hsp->percent_identity), 98.29); 152 is(sprintf("%.4f",$hsp->frac_identical('query')), 0.9829); 153 is(sprintf("%.4f",$hsp->frac_identical('hit')), 0.9829); 154 is($hsp->gaps, 0); 155 $hsps_left--; 156 } 157 is($hsps_left, 0); 158 } 159 last if( $count++ > @valid ); 160} 161is(@valid, 0); 162 163$searchio = Bio::SearchIO->new('-format' => 'blast_pull', 164 '-file' => test_input_file('a_thaliana.blastn')); 165 166$result = $searchio->next_result; 167is($result->database_name, 'All GenBank+EMBL+DDBJ+PDB sequences (but no EST, STS, GSS,or phase 0, 1 or 2 HTGS sequences)'); 168is($result->database_letters, 4677375331); 169is($result->database_entries, 1083200); 170is($result->algorithm, 'BLASTN'); 171like($result->algorithm_version, qr/^2\.2\.1/); 172is($result->query_name, ''); 173is($result->query_length, 60); 174is($result->get_parameter('gapopen'), 5); 175is($result->get_parameter('gapext'), 2); 176is($result->get_parameter('ktup'), undef); 177 178is($result->get_statistic('lambda'), 1.37); 179is($result->get_statistic('kappa'), 0.711); 180is($result->get_statistic('entropy'),1.31 ); 181is($result->get_statistic('T'), 0); 182is($result->get_statistic('A'), 30); 183is($result->get_statistic('X1'), '6'); 184is($result->get_statistic('X1_bits'), 11.9); 185is($result->get_statistic('X2'), 15); 186is($result->get_statistic('X2_bits'), 29.7); 187is($result->get_statistic('S1'), 12); 188is($result->get_statistic('S1_bits'), 24.3); 189is($result->get_statistic('S2'), 17); 190is($result->get_statistic('S2_bits'), 34.2); 191 192is($result->get_statistic('dbentries'), 1083200); 193 194@valid = ( [ 'gb|AY052359.1|', 2826, 'AY052359', '3e-18', 96, 1, 60, 195 '1.0000'], 196 [ 'gb|AC002329.2|AC002329', 76170, 'AC002329', '3e-18', 96, 1, 60, 197 '1.0000' ], 198 [ 'gb|AF132318.1|AF132318', 5383, 'AF132318', '0.04', 42, 35, 55, 199 '0.3500']); 200$count = 0; 201 202while( my $hit = $result->next_hit ) { 203 my $d = shift @valid; 204 is($hit->name, shift @$d); 205 is($hit->length, shift @$d); 206 is($hit->accession, shift @$d); 207 float_is($hit->significance, shift @$d); 208 is($hit->raw_score, shift @$d ); 209 is($hit->start, shift @$d); 210 is($hit->end,shift @$d); 211 is(sprintf("%.4f",$hit->frac_aligned_query), shift @$d); 212 if( $count == 0 ) { 213 my $hsps_left = 1; 214 while( my $hsp = $hit->next_hsp ) { 215 is($hsp->query->start, 1); 216 is($hsp->query->end, 60); 217 is($hsp->query->strand, 1); 218 is($hsp->hit->start, 154); 219 is($hsp->hit->end, 212); 220 is($hsp->hit->strand, 1); 221 is($hsp->length('hsp'), 60); 222 float_is($hsp->evalue, 3e-18); 223 is($hsp->score, 48); 224 is($hsp->bits,95.6); 225 is(sprintf("%.2f",$hsp->percent_identity), 96.67); 226 is(sprintf("%.4f",$hsp->frac_identical('query')), 0.9667); 227 is(sprintf("%.4f",$hsp->frac_identical('hit')), 0.9831); 228 is($hsp->query->frame(), 0); 229 is($hsp->hit->frame(), 0); 230 is($hsp->query->seq_id, ''); 231 is($hsp->hit->seq_id, 'gb|AY052359.1|'); 232 is($hsp->gaps('query'), 0); 233 is($hsp->gaps('hit'), 1); 234 is($hsp->gaps, 1); 235 is($hsp->query_string, 'aggaatgctgtttaattggaatcgtacaatggagaatttgacggaaatagaatcaacgat'); 236 is($hsp->hit_string, 'aggaatgctgtttaattggaatca-acaatggagaatttgacggaaatagaatcaacgat'); 237 is($hsp->homology_string, '||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||'); 238 is(sprintf("%.2f",$hsp->get_aln->overall_percentage_identity), 96.67); 239 is(sprintf("%.2f",$hsp->get_aln->percentage_identity), 98.31); 240 $hsps_left--; 241 } 242 is($hsps_left, 0); 243 } 244 last if( $count++ > @valid ); 245} 246is(@valid, 0); 247 248$searchio = Bio::SearchIO->new('-format' => 'blast_pull', 249 '-file' => test_input_file('frac_problems.blast')); 250my @expected = ("1.000", "0.943"); 251while (my $result = $searchio->next_result) { 252 my $hit = $result->next_hit; 253 if (@expected == 2) { 254 is($hit->frac_identical, shift @expected); 255 } 256 else { 257 TODO: { 258 local $TODO = 'frac_identical failing!'; 259 is($hit->frac_identical, shift @expected); 260 } 261 } 262} 263is(@expected, 0); 264 265# And even more: frac_aligned_query should never be over 1! 266$searchio = Bio::SearchIO->new('-format' => 'blast_pull', 267 '-file' => test_input_file('frac_problems2.blast')); 268$result = $searchio->next_result; 269$hit = $result->next_hit; 270is $hit->frac_aligned_query, 0.97; 271 272# Also, start and end should be sane 273$searchio = Bio::SearchIO->new('-format' => 'blast_pull', 274 '-file' => test_input_file('frac_problems3.blast')); 275$result = $searchio->next_result; 276$hit = $result->next_hit; 277is $hit->start('sbjct'), 207; 278is $hit->end('sbjct'), 1051; 279 280# Do a multiblast report test 281$searchio = Bio::SearchIO->new('-format' => 'blast_pull', 282 '-file' => test_input_file('multi_blast.bls')); 283 284@expected = qw(CATH_RAT CATL_HUMAN CATL_RAT PAPA_CARPA); 285my $results_left = 4; 286while( my $result = $searchio->next_result ) { 287 is($result->query_name, shift @expected, "Multiblast query test"); 288 $results_left--; 289} 290is($results_left, 0); 291 292# Web blast result parsing 293$searchio = Bio::SearchIO->new(-format => 'blast_pull', 294 -file => test_input_file('catalase-webblast.BLASTP')); 295ok($result = $searchio->next_result); 296ok($hit = $result->next_hit); 297is($hit->name, 'gi|40747822|gb|EAA66978.1|', 'full hit name'); 298is($hit->accession, 'EAA66978', 'hit accession'); 299ok(my $hsp = $hit->next_hsp); 300is($hsp->query->start, 1, 'query start'); 301is($hsp->query->end, 528, 'query start'); 302 303# tests for new BLAST 2.2.13 output 304$searchio = Bio::SearchIO->new(-format => 'blast_pull', 305 -file => test_input_file('new_blastn.txt')); 306 307$result = $searchio->next_result; 308is($result->database_name, 'All GenBank+EMBL+DDBJ+PDB sequences (but no EST, STS,GSS,environmental samples or phase 0, 1 or 2 HTGS sequences)'); 309is($result->database_entries, 3742891); 310is($result->database_letters, 16670205594); 311is($result->algorithm, 'BLASTN'); 312is($result->algorithm_version, '2.2.13 [Nov-27-2005]'); 313is($result->query_name, 'pyrR,'); 314is($result->query_length, 558); 315is($result->get_statistic('kappa'), '0.711'); 316is($result->get_statistic('kappa_gapped'), '0.711'); 317is($result->get_statistic('lambda'), '1.37'); 318is($result->get_statistic('lambda_gapped'), '1.37'); 319is($result->get_statistic('entropy'), '1.31'); 320is($result->get_statistic('entropy_gapped'), '1.31'); 321is($result->get_statistic('dbletters'), '-509663586'); 322is($result->get_statistic('dbentries'), 3742891); 323is($result->get_statistic('effective_hsplength'), undef); 324is($result->get_statistic('effectivespace'), 8935230198384); 325is($result->get_parameter('matrix'), 'blastn matrix:1 -3'); 326is($result->get_parameter('gapopen'), 5); 327is($result->get_parameter('gapext'), 2); 328is($result->get_statistic('S2'), '60'); 329is($result->get_statistic('S2_bits'), '119.4'); 330float_is($result->get_parameter('expect'), '1e-23'); 331is($result->get_statistic('num_extensions'), '117843'); 332 333@valid = ( [ 'gi|41400296|gb|AE016958.1|', 4829781, 'AE016958', '6e-059', 236], 334 [ 'gi|54013472|dbj|AP006618.1|', 6021225, 'AP006618', '4e-026', 127], 335 [ 'gi|57546753|dbj|BA000030.2|', 9025608, 'BA000030', '1e-023', 119]); 336$count = 0; 337while( $hit = $result->next_hit ) { 338 my $d = shift @valid; 339 340 is($hit->name, shift @$d); 341 is($hit->length, shift @$d); 342 is($hit->accession, shift @$d); 343 float_is($hit->significance, shift @$d); 344 is($hit->raw_score, shift @$d ); 345 346 if( $count == 0 ) { 347 my $hsps_left = 1; 348 while( my $hsp = $hit->next_hsp ) { 349 is($hsp->query->start, 262); 350 is($hsp->query->end, 552); 351 is($hsp->hit->start, 1166897); 352 is($hsp->hit->end, 1167187); 353 is($hsp->length('hsp'), 291); 354 is($hsp->start('hit'), $hsp->hit->start); 355 is($hsp->end('query'), $hsp->query->end); 356 is($hsp->strand('sbjct'), $hsp->subject->strand);# alias for hit 357 float_is($hsp->evalue, 6e-59); 358 is($hsp->score, 119); 359 is($hsp->bits,236); 360 is(sprintf("%.2f",$hsp->percent_identity), 85.22); 361 is(sprintf("%.4f",$hsp->frac_identical('query')), 0.8522); 362 is(sprintf("%.4f",$hsp->frac_identical('hit')), 0.8522); 363 is($hsp->gaps, 0); 364 $hsps_left--; 365 } 366 is($hsps_left, 0); 367 } 368 last if( $count++ > @valid ); 369} 370is(@valid, 0); 371 372# Bug 2189 373$searchio = Bio::SearchIO->new(-format => 'blast_pull', 374 -file => test_input_file('blastp2215.blast')); 375 376$result = $searchio->next_result; 377is($result->database_entries, 4460989); 378is($result->database_letters, 1533424333); 379is($result->algorithm, 'BLASTP'); 380is($result->algorithm_version, '2.2.15 [Oct-15-2006]'); 381is($result->query_name, 'gi|15608519|ref|NP_215895.1|'); 382is($result->query_length, 193); 383my @hits = $result->hits; 384is(scalar(@hits), 10); 385is($hits[1]->accession,'1W30'); 386float_is($hits[4]->significance,'2e-72'); 387is($hits[7]->score,'254'); 388$result = $searchio->next_result; 389is($result->database_entries, 4460989); 390is($result->database_letters, 1533424333); 391is($result->algorithm, 'BLASTP'); 392is($result->algorithm_version, '2.2.15 [Oct-15-2006]'); 393is($result->query_name, 'gi|15595598|ref|NP_249092.1|'); 394is($result->query_length, 423); 395@hits = $result->hits; 396is(scalar(@hits), 10); 397is($hits[1]->accession,'ZP_00972546'); 398float_is($hits[4]->significance, '0.0'); 399is($hits[7]->score, 624); 400 401# Bug 2246 402$searchio = Bio::SearchIO->new(-format => 'blast_pull', 403 -verbose => -1, 404 -file => test_input_file('bug2246.blast')); 405$result = $searchio->next_result; 406$hit = $result->next_hit; 407is $hit->name, 'UniRef50_Q9X0H5'; 408is $hit->length, undef; 409is $hit->accession, 'UniRef50_Q9X0H5'; 410is $hit->description, 'Cluster: Histidyl-tRNA synthetase; n=4; Thermoto...'; 411is $hit->raw_score, 23; 412float_is($hit->significance, 650); 413 414#*** need to /fully/ test a multi-result, multi-hit, multi-hsp report! 415