1# -*-Perl-*- Test Harness script for Bioperl
2# $Id$
3
4use strict;
5
6BEGIN {
7    use Bio::Root::Test;
8
9    test_begin(-tests           => 561,
10               -requires_module => 'Data::Stag');
11
12    use_ok('Bio::SeqIO');
13}
14
15
16my $verbose = test_debug();
17
18################################## GenBank ##################################
19
20my $ast = Bio::SeqIO->new(-format => 'gbdriver' ,
21                        -verbose => $verbose,
22                        -file => test_input_file("roa1.genbank"));
23$ast->verbose($verbose);
24my $as = $ast->next_seq();
25is $as->molecule, 'mRNA',$as->accession_number;
26is $as->alphabet, 'dna';
27is($as->primary_id, 3598416);
28my @class = $as->species->classification;
29is $class[$#class],'Eukaryota';
30
31$ast = Bio::SeqIO->new(-format => 'gbdriver',
32                              -verbose => $verbose,
33                       -file => test_input_file("NT_021877.gbk"));
34$ast->verbose($verbose);
35$as = $ast->next_seq();
36is $as->molecule, 'DNA',$as->accession_number;
37is $as->alphabet, 'dna';
38is($as->primary_id, 37539616);
39is($as->accession_number, 'NT_021877');
40
41my ($cds) = grep { $_->primary_tag eq 'CDS' } $as->get_SeqFeatures();
42is(($cds->get_tag_values('transl_except'))[1],
43   '(pos:complement(4224..4226),aa:OTHER)');
44
45# test for a DBSOURCE line
46$ast = Bio::SeqIO->new(-format => 'gbdriver',
47                              -verbose => $verbose,
48                       -file => test_input_file("BAB68554.gb"));
49$ast->verbose($verbose);
50$as = $ast->next_seq();
51is $as->molecule, 'linear',$as->accession_number;;
52is $as->alphabet, 'protein';
53# Though older GenBank releases indicate SOURCE contains only the common name,
54# this is no longer true.  In general, this line will contain an abbreviated
55# form of the full organism name (but may contain the full length name),
56# as well as the optional common name and organelle.  There is no get/set
57# for the abbreviated name but it is accessible via name()
58ok defined($as->species->name('abbreviated')->[0]);
59is $as->species->name('abbreviated')->[0], 'Aldabra giant tortoise';
60is($as->primary_id, 15824047);
61my $ac = $as->annotation;
62ok defined $ac;
63my @dblinks = $ac->get_Annotations('dblink');
64is(scalar @dblinks,1);
65is($dblinks[0]->database, 'GenBank');
66is($dblinks[0]->primary_id, 'AB072353');
67is($dblinks[0]->version, '1');
68is($dblinks[0]->display_text, 'GenBank:AB072353.1','operator overloading in AnnotationI is deprecated');
69
70# test for multi-line SOURCE
71$ast = Bio::SeqIO->new(-format => 'gbdriver',
72                              -verbose => $verbose,
73                       -file => test_input_file("NC_006346.gb"));
74$as = $ast->next_seq;
75is $as->species->binomial('FULL'), 'Bolitoglossa n. sp. RLM-2004',$as->accession_number;;
76@class = $as->species->classification;
77is($class[$#class],'Eukaryota');
78is($as->species->common_name,'mushroomtongue salamander');
79
80$ast = Bio::SeqIO->new(-format => 'gbdriver',
81                              -verbose => $verbose,
82                       -file => test_input_file("U71225.gb"));
83$as = $ast->next_seq;
84@class = $as->species->classification;
85is($class[$#class],'Eukaryota',$as->accession_number);
86is $as->species->common_name,'black-bellied salamander';
87
88# test for unusual common name
89$ast = Bio::SeqIO->new(-format => 'gbdriver',
90                              -verbose => $verbose,
91                       -file => test_input_file("AB077698.gb"));
92$as = $ast->next_seq;
93# again, this is not a common name but is in name('abbreviated')
94ok defined($as->species->name('abbreviated')->[0]),$as->accession_number;
95is $as->species->name('abbreviated')->[0],'Homo sapiens cDNA to mRNA';
96
97# test for common name with parentheses
98$ast = Bio::SeqIO->new(-format => 'gbdriver',
99                              -verbose => $verbose,
100                       -file => test_input_file("DQ018368.gb"));
101$as = $ast->next_seq;
102is $as->species->scientific_name,'(Populus tomentosa x P. bolleana) x P. tomentosa var. truncata',
103$as->accession_number;;
104
105# test secondary accessions
106my $seqio = Bio::SeqIO->new(-format => 'gbdriver',
107                                    -verbose => $verbose,
108                                    -file => test_input_file('D10483.gbk'));
109my $seq = $seqio->next_seq;
110my @kw =  $seq->get_keywords;
111is(scalar @kw, 118, $seq->accession_number);
112is($kw[-1], 'yabO');
113my @sec_acc = $seq->get_secondary_accessions();
114is(scalar @sec_acc,14);
115is($sec_acc[-1], 'X56742');
116
117# bug #1487
118my $str = Bio::SeqIO->new(-verbose => $verbose,
119                                 -file    => test_input_file('D12555.gbk'));
120eval {
121    $seq = $str->next_seq;
122};
123
124ok(! $@, 'bug 1487');
125
126# bug 1647 rpt_unit sub-feature with multiple parens
127$str = Bio::SeqIO->new(-format => 'gbdriver',
128                              -verbose => $verbose,
129                       -file => test_input_file('mini-AE001405.gb'));
130ok($seq = $str->next_seq);
131my @rpts = grep { $_->primary_tag eq 'repeat_region' }
132  $seq->get_SeqFeatures;
133is $#rpts, 2, 'bug 1647';
134my @rpt_units = grep {$_->has_tag('rpt_unit')} @rpts;
135is $#rpt_units, 0;
136is(($rpt_units[0]->get_tag_values('rpt_unit'))[0],'(TG)10;A;(TG)7');
137
138# test bug #1673 , RDB-II genbank files
139$str = Bio::SeqIO->new(-format => 'gbdriver',
140                              -verbose => $verbose,
141                       -file => test_input_file('Mcjanrna_rdbII.gbk')
142              );
143ok($seq = $str->next_seq, 'bug 1673');
144my @refs = $seq->annotation->get_Annotations('reference');
145is(@refs, 1);
146is($seq->display_id,'Mc.janrrnA');
147is($seq->molecule ,'RNA');
148
149$str  = Bio::SeqIO->new(-format => 'gbdriver',
150                              -file   => test_input_file("AF165282.gb"),
151                              -verbose => $verbose);
152$seq = $str->next_seq;
153my @features = $seq->all_SeqFeatures();
154is(@features, 5, $seq->accession_number);
155is($features[0]->start, 1);
156is($features[0]->end, 226);
157my $location = $features[1]->location;
158ok($location->isa('Bio::Location::SplitLocationI'));
159my @sublocs = $location->sub_Location();
160is(@sublocs, 29);
161
162# version and primary ID - believe it or not, this wasn't working
163is ($seq->version, 1);
164is ($seq->seq_version, 1);
165is ($seq->primary_id, "5734104");
166
167# streaming and Bio::RichSeq creation
168my $stream = Bio::SeqIO->new(-file => test_input_file("test.genbank"),
169                                      -verbose => $verbose,
170                             -format => 'gbdriver');
171$stream->verbose($verbose);
172my $seqnum = 0;
173my $species;
174my @cl;
175my $lasts;
176my @ids = qw(DDU63596 DDU63595 HUMBDNF);
177my @tids = (44689, 44689, 9606);
178my @tnames = ("Dictyostelium discoideum","Dictyostelium discoideum",
179                  "Homo sapiens");
180while($seq = $stream->next_seq()) {
181    if($seqnum < 3) {
182        is $seq->display_id(), $ids[$seqnum];
183        $species = $seq->species();
184        @cl = $species->classification();
185        is( $species->binomial(), $tnames[$seqnum],
186             'species parsing incorrect for genbank');
187        is( $cl[3] ne $species->genus(), 1,
188             'genus duplicated in genbank parsing');
189        is( $species->ncbi_taxid, $tids[$seqnum] );
190    }
191    $seqnum++;
192    $lasts = $seq;
193}
194is($seqnum, 5,'streaming');
195is $lasts->display_id(), "HUMBETGLOA";
196my ($ref) = $lasts->annotation->get_Annotations('reference');
197is($ref->medline, 94173918);
198$stream->close();
199
200$stream = Bio::SeqIO->new(-file => test_input_file("test.genbank.noseq"),
201                                  -verbose => $verbose,
202                                  -format => 'gbdriver' );
203$seqnum = 0;
204while($seq = $stream->next_seq()) {
205    if($seqnum < 3) {
206        is $seq->display_id(), $ids[$seqnum];
207    } elsif( $seq->display_id eq 'M37762') {
208        is( ($seq->get_keywords())[0], 'neurotrophic factor');
209    }
210    $seqnum++;
211}
212is $seqnum, 5, "Total number of sequences in test file";
213
214# fuzzy
215$seq = Bio::SeqIO->new( -format => 'gbdriver',
216                                -verbose => $verbose,
217                        -file =>test_input_file("testfuzzy.genbank"));
218$seq->verbose($verbose);
219ok(defined($as = $seq->next_seq()));
220
221@features = $as->all_SeqFeatures();
222is(@features,21,'Fuzzy in');
223my $lastfeature = pop @features;
224# this is a split location; the root doesn't have strand
225is($lastfeature->strand, undef);
226$location = $lastfeature->location;
227#$location->verbose(-1); # silence the warning of undef seq_id()
228# see above; splitlocs roots do not have a strand really
229is($location->strand, undef);
230is($location->start, 83202);
231is($location->end, 84996);
232
233@sublocs = $location->sub_Location();
234
235is(@sublocs, 2);
236my $loc = shift @sublocs;
237is($loc->start, 83202);
238is($loc->end, 83329);
239is($loc->strand, -1);
240
241$loc = shift @sublocs;
242is($loc->start, 84248);
243is($loc->end, 84996);
244is($loc->strand,1);
245
246my $outfile = test_output_file();
247$seq = Bio::SeqIO->new(-format => 'genbank',
248                              -verbose => $verbose,
249                       -file=> ">$outfile");
250$seq->verbose($verbose);
251ok($seq->write_seq($as),'Fuzzy out');
252
253## now genbank ##
254$str = Bio::SeqIO->new(-format =>'gbdriver',
255                             -verbose => $verbose,
256                             -file => test_input_file('BK000016-tpa.gbk'));
257$seq = $str->next_seq;
258ok(defined $seq, $seq->accession_number);
259ok(defined $seq->seq);
260is($seq->accession_number, 'BK000016',$seq->accession_number);
261is($seq->alphabet, 'dna');
262is($seq->display_id, 'BK000016');
263is($seq->length, 1162);
264is($seq->division, 'ROD');
265is($seq->get_dates, 1);
266is($seq->keywords, 'Third Party Annotation; TPA');
267is($seq->desc, 'TPA: Mus musculus pantothenate kinase 4 mRNA, partial cds.');
268is($seq->seq_version, 1);
269is($seq->feature_count, 2);
270my $spec_obj = $seq->species;
271is ($spec_obj->common_name, 'house mouse');
272is ($spec_obj->species, 'musculus');
273is ($spec_obj->genus, 'Mus');
274is ($spec_obj->binomial, 'Mus musculus');
275$ac = $seq->annotation;
276my $reference =  ($ac->get_Annotations('reference') )[0];
277is ($reference->pubmed, '11479594');
278is ($reference->medline, '21372465',$seq->accession_number);
279
280# validate that what is written is what is read
281my $testfile = test_output_file();
282my $out = Bio::SeqIO->new(-file => ">$testfile",
283                             -format => 'genbank');
284$out->write_seq($seq);
285$out->close();
286
287$str = Bio::SeqIO->new(-format =>'gbdriver',
288                             -file => $testfile);
289$seq = $str->next_seq;
290ok(defined $seq,'roundtrip');
291ok(defined $seq->seq);
292is($seq->accession_number, 'BK000016');
293is($seq->alphabet, 'dna');
294is($seq->display_id, 'BK000016');
295is($seq->length, 1162);
296is($seq->division, 'ROD');
297is($seq->get_dates, 1);
298is($seq->keywords, 'Third Party Annotation; TPA');
299is($seq->desc, 'TPA: Mus musculus pantothenate kinase 4 mRNA, partial cds.');
300is($seq->seq_version, 1);
301is($seq->feature_count, 2);
302$spec_obj = $seq->species;
303is ($spec_obj->common_name, 'house mouse');
304is ($spec_obj->species, 'musculus');
305is ($spec_obj->genus, 'Mus');
306is ($spec_obj->binomial, 'Mus musculus');
307$ac = $seq->annotation;
308$reference =  ($ac->get_Annotations('reference') )[0];
309is ($reference->pubmed, '11479594');
310is ($reference->medline, '21372465');
311
312# write revcomp split location
313my $gb = Bio::SeqIO->new(-format => 'gbdriver',
314                        -verbose => $verbose,
315                        -file   => test_input_file('revcomp_mrna.gb'));
316$seq = $gb->next_seq();
317
318$gb = Bio::SeqIO->new(-format => 'genbank',
319                     -file   => ">$testfile");
320
321$gb->write_seq($seq);
322undef $gb;
323ok(! -z $testfile, 'revcomp split location');
324
325# bug 1925, continuation of long ORGANISM line ends up in @classification:
326# ORGANISM  Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC
327#           9150
328#           Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;
329#           Enterobacteriaceae; Salmonella.
330$gb = Bio::SeqIO->new(-format => 'gbdriver',
331                     -verbose => $verbose,
332                        -file   => test_input_file('NC_006511-short.gbk'));
333$seq = $gb->next_seq;
334is $seq->species->common_name, undef, "Bug 1925";
335is $seq->species->scientific_name, "Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150";
336@class = $seq->species->classification;
337is $class[$#class], "Bacteria";
338
339# WGS tests
340$gb = Bio::SeqIO->new(-format => 'gbdriver',
341                      -verbose => $verbose,
342                    -file   => test_input_file('O_sat.wgs'));
343$seq = $gb->next_seq;
344
345my @tests = ('wgs'        => 'AAAA02000001-AAAA02050231',
346            'wgs_scafld' => 'CM000126-CM000137',
347            'wgs_scafld' => 'CH398081-CH401163');
348
349my @wgs = map {$seq->annotation->get_Annotations(lc($_))} (qw(WGS WGS_SCAFLD));
350
351my $ct=0;
352
353for my $wgs (@wgs) {
354    my ($tagname, $value) = (shift @tests, shift @tests);
355    is($wgs->tagname, $tagname, $tagname);
356    is($wgs->value, $value);
357    $ct++;
358}
359
360is ($ct, 3);
361
362# make sure we can retrieve a feature with a primary tag of 'misc_difference'
363$gb = Bio::SeqIO->new(-format => 'gbdriver',
364                     -verbose => $verbose,
365                    -file   => test_input_file('BC000007.gbk'));
366$seq = $gb->next_seq;
367($cds) = grep { $_->primary_tag eq 'misc_difference' } $seq->get_SeqFeatures;
368my @vals = $cds->get_tag_values('gene');
369is $vals[0], 'PX19', $seq->accession_number;
370
371# Check that the source,organism section is identical between input and output.
372# - test an easy one where organism is species, then two different formats of
373# subspecies, then a species with a format that used to be mistaken for
374# subspecies, then a bacteria with no genus, and finally a virus with a genus.
375
376# These tests are now somewhat out-of-date since we are moving to a Bio::Taxon-
377# based system for verifying taxonomic information.  Right now they just verify
378# changes so are really useless; I will change them to verify common name,
379# organelle, scientific name, etc.
380
381
382# output always adds a period (GenBank std), but two of these files do not use them.
383
384foreach my $in ('BK000016-tpa.gbk', 'ay116458.gb', 'ay149291.gb', 'NC_006346.gb', 'ay007676.gb', 'dq519393.gb') {
385    my $infile =  test_input_file($in);
386	$outfile = test_output_file();
387
388    $str = Bio::SeqIO->new(-format =>'genbank',
389                          -verbose => $verbose,
390                          -file => $infile);
391    $seq = $str->next_seq;
392
393    $out = Bio::SeqIO->new(-file => $outfile, -format => 'genbank');
394    $out->write_seq($seq);
395    $out->close();
396
397    open my $IN, '<', $infile or die "Could not read file '$infile': $!\n";
398    my @in = <$IN>;
399    close $IN;
400    open my $RESULT, '<', $outfile or die "Could not read file '$outfile': $!\n";
401    my $line = 0;
402    my $check = 0;
403    my $is = 1;
404
405    FILECHECK:
406    while (my $result = <$RESULT>) {
407        if ($result =~ /^KEYWORDS/) {
408            $check = 1;
409            next;
410        }
411
412        if ($result =~ /^REFERENCE/) {
413            last FILECHECK;
414        }
415
416        if ($check) {
417
418            # end periods don't count (not all input files have them)
419            $result =~ s{\.$}{};
420            $in[$line] =~ s{\.$}{};
421
422            if ($result ne $in[$line]) {
423                $is = 0;
424                last;
425            }
426        }
427    } continue { $line++ }
428    close $RESULT;
429
430    ok $is, $in;
431}
432
433# NB: there should probably be full testing on all lines to ensure that output
434# matches input.
435
436# 20061117: problem with *double* colon in some annotation-dblink values
437$ct = 0;
438
439foreach my $in ('P35527.gb') {
440    my $infile =  test_input_file($in);
441    $str = Bio::SeqIO->new(-format =>'genbank',
442                         -verbose => $verbose,
443                         -file => $infile);
444    $seq = $str->next_seq;
445    my $ac      = $seq->annotation();      # Bio::AnnotationCollection
446    foreach my $key ($ac->get_all_annotation_keys() ) {
447        my @values = $ac->get_Annotations($key);
448        foreach my $ann (@values) {
449            my $value = $ann->display_text;
450            $ct++;
451            if ($key eq 'dblink') {
452
453                ok (index($value,'::') < 0);   # this should never be true
454
455                ok ($value, $value);   # check value is not empty
456
457                #  print "  ann/", sprintf('%12s  ',$key), '>>>', $value , '<<<', "\n";
458                #  print "        index double colon: ",index($value   ,'::'), "\n";
459
460                #  check db name:
461                my @parts = split(/:/,$value);
462                if ( $parts[0] =~ /^(?:
463                        #  not an exhaustive list of databases;
464                        #  just the db's referenced in P35527.gb:
465                        swissprot | GenBank | GenPept  | HSSP| IntAct | Ensembl | KEGG | HGNC | MIM | ArrayExpress
466                                  | GO      | InterPro | Pfam| PRINTS | PROSITE
467                                     )$/x )
468                {
469                    ok 1;
470                }
471                else {
472                    ok 0;
473                }
474                    ok ( $parts[1], "$parts[0]" );
475            }
476                # elsif ($key eq 'reference') { }
477        }
478    }
479}
480
481is($ct, 46);
482
483# bug 2195
484
485$str = Bio::SeqIO->new(-format =>'gbdriver',
486                      -verbose => $verbose,
487                      -file => test_input_file('AF305198.gb')
488                     );
489
490$species = $str->next_seq->species;
491
492is($species->scientific_name, 'Virginia creeper phytoplasma', 'Bug 2195');
493is(join(', ',$species->classification), 'Virginia creeper phytoplasma, '.
494   '16SrV (Elm yellows group), Candidatus Phytoplasma, '.
495   'Acholeplasmataceae, Acholeplasmatales, Mollicutes, '.
496   'Firmicutes, Bacteria', 'Bug 2195');
497
498# bug 2569, PROJECT line support, read and write, round-tripping
499
500$str = Bio::SeqIO->new(-format =>'gbdriver',
501                      -verbose => $verbose,
502                      -file => test_input_file('NC_008536.gb'));
503
504$seq = $str->next_seq;
505
506my $project = ($seq->annotation->get_Annotations('project'))[0];
507
508isa_ok($project, 'Bio::Annotation::SimpleValue');
509
510if ($project) {
511	is($project->value, 'GenomeProject:12638');
512} else {
513	ok(0, "PROJECT not parsed");
514}
515
516$outfile = test_output_file();
517
518$gb = Bio::SeqIO->new(-format => 'genbank',
519                              -verbose => $verbose,
520                       -file=> ">$outfile");
521
522$gb->write_seq($seq);
523
524$str = Bio::SeqIO->new(-format =>'gbdriver',
525                      -verbose => $verbose,
526                      -file => $outfile);
527
528$seq = $str->next_seq;
529
530$project = ($seq->annotation->get_Annotations('project'))[0];
531
532isa_ok($project, 'Bio::Annotation::SimpleValue');
533
534if ($project) {
535	is($project->value, 'GenomeProject:12638');
536} else {
537	ok(0, "Roundtrip test failed");
538}
539
540################################## EMBL ##################################
541
542# Set to -1 for release version, so warnings aren't printed
543
544$ast = Bio::SeqIO->new( -format => 'embldriver',
545			   -verbose => $verbose,
546			   -file => test_input_file("roa1.dat"));
547$ast->verbose($verbose);
548$as = $ast->next_seq();
549ok defined $as->seq;
550is($as->display_id, 'HSHNCPA1');
551is($as->accession_number, 'X79536');
552is($as->seq_version, 1);
553is($as->version, 1);
554is($as->desc, 'H.sapiens mRNA for hnRNPcore protein A1');
555is($as->molecule, 'RNA');
556is($as->alphabet, 'rna');
557is(scalar $as->all_SeqFeatures(), 4);
558is($as->length, 1198);
559is($as->species->binomial(), 'Homo sapiens');
560is($as->get_dates, 2);
561
562# EMBL Release 87 changes (8-17-06)
563
564$ast = Bio::SeqIO->new( -format => 'embldriver',
565			   -verbose => $verbose,
566			   -file => test_input_file("roa1_v2.dat"));
567$ast->verbose($verbose);
568$as = $ast->next_seq();
569ok defined $as->seq;
570# accession # same as display name now
571is($as->display_id, 'X79536');
572is($as->get_dates, 2);
573is($as->accession_number, 'X79536');
574is($as->seq_version, 1);
575is($as->version, 1);
576is($as->desc, 'H.sapiens mRNA for hnRNPcore protein A1');
577# mRNA instead of RNA
578is($as->molecule, 'mRNA');
579is($as->alphabet, 'rna');
580is(scalar $as->all_SeqFeatures(), 4);
581is($as->length, 1198);
582is($as->species->binomial(), 'Homo sapiens');
583
584my $ent = Bio::SeqIO->new( -file => test_input_file("test.embl"),
585			   -format => 'embldriver');
586$seq = $ent->next_seq();
587
588is(defined $seq->seq(), 1,
589   'success reading Embl with ^ location and badly split double quotes');
590is(scalar $seq->annotation->get_Annotations('reference'), 3);
591is($seq->get_dates, 0);
592
593$out = Bio::SeqIO->new(-file=> ">$outfile",
594			  -format => 'embl');
595is($out->write_seq($seq),1,
596   'success writing Embl format with ^ < and > locations');
597
598# embl with no FT
599$ent = Bio::SeqIO->new( -file => test_input_file("test.embl"),
600			-format => 'embldriver');
601$seq = $ent->next_seq();
602
603ok($seq);
604is(lc($seq->subseq(1,10)),'gatcagtaga');
605is($seq->length, 4870);
606
607# embl with no FH
608my $noFH = Bio::SeqIO->new(-file => test_input_file("no_FH.embl"),
609			-format => 'embldriver');
610$seq = $noFH->next_seq;
611is(scalar($seq->get_SeqFeatures), 4);
612is($seq->display_id, 'AE000001');
613is($seq->get_dates, 0);
614
615# bug 1571
616$ent = Bio::SeqIO->new(-format => 'embldriver',
617		       -file   => test_input_file('test.embl2sq'));
618$seq = $ent->next_seq;
619is($seq->length,4870);
620is($seq->get_dates, 0);
621
622# embl repbase
623$ent = Bio::SeqIO->new(-file => test_input_file("BEL16-LTR_AG.embl"), -format => 'embldriver');
624$seq = $ent->next_seq;
625is($seq->display_id,'BEL16-LTR_AG');
626is($seq->get_dates, 2);
627
628# test secondary accessions in EMBL (bug #1332)
629$seqio = Bio::SeqIO->new(-format => 'embldriver',
630			   -file => test_input_file('ECAPAH02.embl'));
631$seq = $seqio->next_seq;
632is($seq->accession_number, 'D10483');
633is($seq->seq_version, 2);
634my @accs = $seq->get_secondary_accessions();
635is($accs[0], 'J01597');
636is($accs[-1], 'X56742');
637is($seq->get_dates, 2);
638
639### TPA TESTS - Thanks to Richard Adams ###
640# test Third Party Annotation entries in EMBL/Gb format
641# to ensure compatability with parsers.
642$str = Bio::SeqIO->new(-verbose => $verbose,
643                         -format =>'embldriver',
644			 -file => test_input_file('BN000066-tpa.embl'));
645$seq = $str->next_seq;
646ok(defined $seq);
647is($seq->accession_number, 'BN000066');
648is($seq->alphabet, 'dna');
649is($seq->display_id, 'AGA000066');
650is($seq->length, 5195);
651is($seq->division, 'INV');
652is($seq->keywords, 'acetylcholinesterase; achE1 gene; Third Party Annotation; TPA');
653is($seq->seq_version, 1);
654is($seq->feature_count, 15);
655is($seq->get_dates, 2);
656
657$spec_obj = $seq->species;
658is ($spec_obj->common_name, 'African malaria mosquito');
659is ($spec_obj->species, 'gambiae');
660is ($spec_obj->genus, 'Anopheles');
661is ($spec_obj->binomial, 'Anopheles gambiae');
662
663$ac = $seq->annotation;
664$reference =  ($ac->get_Annotations('reference') )[1];
665is ($reference->title,'"A novel acetylcholinesterase gene in mosquitoes codes for the insecticide target and is non-homologous to the ace gene in Drosophila"');
666is ($reference->authors,'Weill M., Fort P., Berthomi eu A., Dubois M.P., Pasteur N., Raymond M.');
667my $cmmnt =  ($ac->get_Annotations('comment') )[0];
668is($cmmnt->text, 'see also AJ488492 for achE-1 from Kisumu strain Third Party Annotation Database: This TPA record uses Anopheles gambiae trace archive data (http://trace.ensembl.org)');
669
670
671$ent = Bio::SeqIO->new( -file => test_input_file("test.embl"),
672                        -format => 'embldriver');
673$ent->verbose($verbose);
674$seq = $ent->next_seq();
675$species = $seq->species();
676@cl = $species->classification();
677is( $cl[3] ne $species->genus(), 1, 'genus duplication test');
678$ent->close();
679
680#
681## read-write - test embl writing of a PrimarySeq
682#
683my $primaryseq = Bio::PrimarySeq->new( -seq => 'AGAGAGAGATA',
684                                      -id  => 'myid',
685                                      -desc => 'mydescr',
686                                      -alphabet => 'DNA',
687                                      -accession_number => 'myaccession');
688
689$verbose = -1 unless $ENV{'BIOPERLDEBUG'};  # silence warnings unless we are debuggin
690
691my $embl = Bio::SeqIO->new(-format => 'embl',
692                          -verbose => $verbose,
693                          -file => ">$outfile");
694
695ok($embl->write_seq($primaryseq));
696
697# this should generate a warning
698my $scalar = "test";
699eval {
700	$embl->write_seq($scalar);
701};
702ok ($@);
703
704############################## Swiss/UniProt ##############################
705
706$seqio = Bio::SeqIO->new( -verbose => $verbose,
707                                     -format => 'swissdriver',
708                                     -file   => test_input_file('test.swiss'));
709
710isa_ok($seqio, 'Bio::SeqIO');
711$seq = $seqio->next_seq;
712my @gns = $seq->annotation->get_Annotations('gene_name');
713
714$outfile = test_output_file();
715$seqio = Bio::SeqIO->new( -verbose => $verbose,
716                                 -format => 'swiss',
717                                 -file   => ">$outfile");
718
719$seqio->write_seq($seq);
720
721# reads it in once again
722$seqio = Bio::SeqIO->new( -verbose => $verbose,
723                                 -format => 'swissdriver',
724                                 -file => $outfile);
725
726$seq = $seqio->next_seq;
727isa_ok($seq->species, 'Bio::Species');
728is($seq->species->ncbi_taxid, 6239);
729
730# version, seq_update, dates (5 tests)
731is($seq->version, 40);
732my ($ann) = $seq->annotation->get_Annotations('seq_update');
733eval {is($ann->display_text, 35,'operator overloading in AnnotationI is deprecated')};
734ok(!$@);
735
736my @dates = $seq->get_dates;
737my @date_check = qw(01-NOV-1997 01-NOV-1997 16-OCT-2001);
738
739for my $date (@dates) {
740    my $expdate = shift @date_check;
741    is($date, $expdate,'dates');
742}
743
744my @gns2 = $seq->annotation->get_Annotations('gene_name');
745# check gene name is preserved (was losing suffix in worm gene names)
746ok($#gns2 == 0 && $gns[0]->value eq $gns2[0]->value);
747
748# test swissprot multiple RP lines
749$str = Bio::SeqIO->new(-file => test_input_file('P33897'));
750$seq = $str->next_seq;
751isa_ok($seq, 'Bio::Seq::RichSeqI');
752@refs = $seq->annotation->get_Annotations('reference');
753is( @refs, 23);
754is($refs[20]->rp, 'VARIANTS X-ALD LEU-98; ASP-99; GLU-217; GLN-518; ASP-608; ILE-633 AND PRO-660, AND VARIANT THR-13.');
755
756# version, seq_update, dates (5 tests)
757is($seq->version, 44);
758($ann) = $seq->annotation->get_Annotations('seq_update');
759is($ann->display_text, 28,'operator overloading in AnnotationI is deprecated');
760@dates = $seq->get_dates;
761@date_check = qw(01-FEB-1994 01-FEB-1994 15-JUN-2004);
762for my $date (@dates) {
763    is($date, shift @date_check);
764}
765
766$ast = Bio::SeqIO->new(-verbose => $verbose,
767                                  -format => 'swissdriver' ,
768                                  -file => test_input_file("roa1.swiss"));
769$as = $ast->next_seq();
770
771ok defined $as->seq;
772is($as->id, 'ROA1_HUMAN', "id is ".$as->id);
773like($as->primary_id, qr(Bio::PrimarySeq));
774is($as->length, 371);
775is($as->alphabet, 'protein');
776is($as->division, 'HUMAN');
777is(scalar $as->all_SeqFeatures(), 16);
778is(scalar $as->annotation->get_Annotations('reference'), 11);
779
780# version, seq_update, dates (5 tests)
781is($as->version, 35);
782($ann) = $as->annotation->get_Annotations('seq_update');
783is($ann->display_text, 15,'operator overloading in AnnotationI is deprecated');
784@dates = $as->get_dates;
785@date_check = qw(01-MAR-1989 01-AUG-1990 01-NOV-1997);
786for my $date (@dates) {
787    is($date, shift @date_check);
788}
789
790($ent,$out) = undef;
791($as,$seq) = undef;
792
793$seqio = Bio::SeqIO->new(-format => 'swissdriver' ,
794                                 -verbose => $verbose,
795                                 -file => test_input_file("swiss.dat"));
796$seq = $seqio->next_seq;
797isa_ok($seq, 'Bio::Seq::RichSeqI');
798
799# more tests to verify we are actually parsing correctly
800like($seq->primary_id, qr(Bio::PrimarySeq));
801is($seq->display_id, 'MA32_HUMAN');
802is($seq->length, 282);
803is($seq->division, 'HUMAN');
804is($seq->alphabet, 'protein');
805my @f = $seq->all_SeqFeatures();
806is(@f, 2);
807is($f[1]->primary_tag, 'CHAIN');
808is(($f[1]->get_tag_values('description'))[0], 'COMPLEMENT COMPONENT 1, Q SUBCOMPONENT BINDING PROTEIN');
809
810# version, seq_update, dates (5 tests)
811is($seq->version, 40);
812($ann) = $seq->annotation->get_Annotations('seq_update');
813is($ann->display_text, 31,'operator overloading in AnnotationI is deprecated');
814@dates = $seq->get_dates;
815@date_check = qw(01-FEB-1995 01-FEB-1995 01-OCT-2000);
816for my $date (@dates) {
817    is($date, shift @date_check);
818}
819
820my @genenames = qw(GC1QBP HABP1 SF2P32 C1QBP);
821($ann) = $seq->annotation->get_Annotations('gene_name');
822# use Data::Stag findval and element name to get values/nodes
823foreach my $gn ( $ann->findval('Name') ) {
824    ok ($gn, shift(@genenames));
825}
826foreach my $gn ( $ann->findval('Synonyms') ) {
827    ok ($gn, shift(@genenames));
828}
829like($ann->value, qr/Name: GC1QBP/);
830
831# test for feature locations like ?..N
832$seq = $seqio->next_seq();
833isa_ok($seq, 'Bio::Seq::RichSeqI');
834like($seq->primary_id, qr(Bio::PrimarySeq));
835is($seq->display_id, 'ACON_CAEEL');
836is($seq->length, 788);
837is($seq->division, 'CAEEL');
838is($seq->alphabet, 'protein');
839is(scalar $seq->all_SeqFeatures(), 5);
840
841foreach my $gn ( $seq->annotation->get_Annotations('gene_name') ) {
842    ok ($gn->value, 'F54H12.1');
843}
844
845# test species in swissprot -- this can be a n:n nightmare
846$seq = $seqio->next_seq();
847isa_ok($seq, 'Bio::Seq::RichSeqI');
848like($seq->primary_id, qr(Bio::PrimarySeq));
849@sec_acc = $seq->get_secondary_accessions();
850is($sec_acc[0], 'P29360');
851is($sec_acc[1], 'Q63631');
852is($seq->accession_number, 'P42655');
853@kw = $seq->get_keywords;
854is( $kw[0], 'Brain');
855is( $kw[1], 'Neurone');
856is($kw[3], 'Multigene family');
857is($seq->display_id, '143E_HUMAN');
858
859# hybrid names from old sequences are no longer valid, these are chopped
860# off at the first organism
861is($seq->species->binomial, "Homo sapiens");
862is($seq->species->common_name, "Human");
863is($seq->species->ncbi_taxid, 9606);
864
865$seq = $seqio->next_seq();
866isa_ok($seq, 'Bio::Seq::RichSeqI');
867like($seq->primary_id, qr(Bio::PrimarySeq));
868is($seq->species->binomial, "Bos taurus");
869is($seq->species->common_name, "Bovine");
870is($seq->species->ncbi_taxid, 9913);
871
872# multiple genes in swissprot
873$seq = $seqio->next_seq();
874isa_ok($seq, 'Bio::Seq::RichSeqI');
875like($seq->primary_id, qr(Bio::PrimarySeq));
876
877($ann) = $seq->annotation->get_Annotations("gene_name");
878@genenames = qw(CALM1 CAM1 CALM CAM CALM2 CAM2 CAMB CALM3 CAM3 CAMC);
879my $flatnames = "(CALM1 OR CAM1 OR CALM OR CAM) AND (CALM2 OR CAM2 OR CAMB) AND (CALM3 OR CAM3 OR CAMC)";
880
881my @names = @genenames; # copy array
882
883my @ann_names = $ann->get_all_values();
884is(scalar(@ann_names), scalar(@names));
885
886# do this in a layered way (nested tags)
887for my $node ($ann->findnode('gene_name')) {
888    for my $name ($node->findval('Name')) {
889        is($name, shift(@names));
890    }
891    for my $name ($node->findval('Synonyms')) {
892        is($name, shift(@names));
893    }
894}
895
896is(scalar(@names),0);
897
898# same entry as before, but with the new gene names format
899$seqio = Bio::SeqIO->new(-format => 'swissdriver',
900                                 -verbose => $verbose,
901                         -file => test_input_file("calm.swiss"));
902$seq = $seqio->next_seq();
903isa_ok($seq, 'Bio::Seq::RichSeqI');
904like($seq->primary_id, qr(Bio::PrimarySeq));
905
906($ann) = $seq->annotation->get_Annotations("gene_name");
907@names = @genenames; # copy array
908
909my @ann_names2 = $ann->get_all_values(); #emulate StructuredValue's flattened array
910is(scalar(@ann_names2), scalar(@names));
911
912for my $node ($ann->findnode('gene_name')) {
913    for my $name ($node->findval('Name')) {
914        is($name, shift(@names));
915    }
916    for my $name ($node->findval('Synonyms')) {
917        is($name, shift(@names));
918    }
919}
920
921is(scalar(@names),0);
922
923# test proper parsing of references
924my @litrefs = $seq->annotation->get_Annotations('reference');
925is(scalar(@litrefs), 17);
926
927my @titles = (
928    '"Complete amino acid sequence of human brain calmodulin."',
929    '"Multiple divergent mRNAs code for a single human calmodulin."',
930    '"Molecular analysis of human and rat calmodulin complementary DNA clones. Evidence for additional active genes in these species."',
931    '"Isolation and nucleotide sequence of a cDNA encoding human calmodulin."',
932    '"Structure of the human CALM1 calmodulin gene and identification of two CALM1-related pseudogenes CALM1P1 and CALM1P2."',
933    undef,
934    '"Characterization of the human CALM2 calmodulin gene and comparison of the transcriptional activity of CALM1, CALM2 and CALM3."',
935    '"Cloning of human full-length CDSs in BD Creator(TM) system donor vector."',
936    '"The DNA sequence and analysis of human chromosome 14."',
937    '"Generation and initial analysis of more than 15,000 full-length human and mouse cDNA sequences."',
938    '"Alpha-helix nucleation by a calcium-binding peptide loop."',
939    '"Solution structure of Ca(2+)-calmodulin reveals flexible hand-like properties of its domains."',
940    '"Calmodulin structure refined at 1.7 A resolution."',
941    '"Drug binding by calmodulin: crystal structure of a calmodulin-trifluoperazine complex."',
942    '"Structural basis for the activation of anthrax adenylyl cyclase exotoxin by calmodulin."',
943    '"Physiological calcium concentrations regulate calmodulin binding and catalysis of adenylyl cyclase exotoxins."',
944    '"Crystal structure of a MARCKS peptide containing the calmodulin-binding domain in complex with Ca2+-calmodulin."',
945);
946
947my @locs = (
948    "Biochemistry 21:2565-2569(1982).",
949    "J. Biol. Chem. 263:17055-17062(1988).",
950    "J. Biol. Chem. 262:16663-16670(1987).",
951    "Biochem. Int. 9:177-185(1984).",
952    "Eur. J. Biochem. 225:71-82(1994).",
953    "Submitted (FEB-1995) to the EMBL/GenBank/DDBJ databases.",
954    "Cell Calcium 23:323-338(1998).",
955    "Submitted (MAY-2003) to the EMBL/GenBank/DDBJ databases.",
956    "Nature 421:601-607(2003).",
957    "Proc. Natl. Acad. Sci. U.S.A. 99:16899-16903(2002).",
958    "Proc. Natl. Acad. Sci. U.S.A. 96:903-908(1999).",
959    "Nat. Struct. Biol. 8:990-997(2001).",
960    "J. Mol. Biol. 228:1177-1192(1992).",
961    "Biochemistry 33:15259-15265(1994).",
962    "Nature 415:396-402(2002).",
963    "EMBO J. 21:6721-6732(2002).",
964    "Nat. Struct. Biol. 10:226-231(2003).",
965);
966
967my @positions = (
968     undef, undef,
969    undef, undef,
970    undef, undef,
971    undef, undef,
972    undef, undef,
973    undef, undef,
974    undef, undef,
975    undef, undef,
976    undef, undef,
977    undef, undef,
978    94, 103,
979    1, 76,
980    undef, undef,
981    undef, undef,
982    5, 148,
983    1, 148,
984    undef, undef,
985);
986
987foreach my $litref (@litrefs) {
988    is($litref->title, shift(@titles));
989    is($litref->location, shift(@locs));
990    is($litref->start, shift(@positions));
991    is($litref->end, shift(@positions));
992}
993
994# format parsing changes (pre-rel 9.0)
995
996$seqio = Bio::SeqIO->new( -verbose => $verbose,
997                         -format => 'swissdriver',
998                         -file   => test_input_file('pre_rel9.swiss'));
999
1000ok($seqio);
1001$seq = $seqio->next_seq;
1002isa_ok($seq->species, 'Bio::Species');
1003is($seq->species->ncbi_taxid, "6239");
1004
1005# version, seq_update, dates (5 tests)
1006is($seq->version, 44);
1007($ann) = $seq->annotation->get_Annotations('seq_update');
1008is($ann->display_text, 1);
1009@dates = $seq->get_dates;
1010@date_check = qw(01-NOV-1997 01-NOV-1996 30-MAY-2006 );
1011for my $date (@dates) {
1012    is($date, shift @date_check);
1013}
1014
1015my @idcheck = qw(Z66513 T22647 Cel.30446 Q06319 Q20772 F54D5.7 WBGene00010052
1016		 F54D5.7 GO:0005515 IPR006089 IPR006091 IPR006090
1017		 IPR006092 IPR009075 IPR009100 IPR013764 PF00441
1018		 PF02770 PF02771 PS00072 PS00073);
1019
1020for my $dblink ( $seq->annotation->get_Annotations('dblink') ) {
1021    is($dblink->primary_id, shift @idcheck);
1022}
1023
1024$seqio = Bio::SeqIO->new( -verbose => $verbose,
1025                         -format => 'swissdriver',
1026                         -file   => test_input_file('pre_rel9.swiss'));
1027
1028my @namespaces = qw(Swiss-Prot TrEMBL TrEMBL);
1029
1030while (my $seq = $seqio->next_seq) {
1031    is($seq->namespace, shift @namespaces);
1032}
1033
1034# format parsing changes (rel 9.0, Oct 2006)
1035
1036$seqio = Bio::SeqIO->new( -verbose => $verbose,
1037                         -format => 'swissdriver',
1038                         -file   => test_input_file('rel9.swiss'));
1039
1040ok($seqio);
1041$seq = $seqio->next_seq;
1042isa_ok($seq->species, 'Bio::Species');
1043is($seq->species->ncbi_taxid, 6239);
1044
1045is($seq->version, 47);
1046($ann) = $seq->annotation->get_Annotations('seq_update');
1047is($ann->display_text, 1,'operator overloading in AnnotationI is deprecated');
1048@dates = $seq->get_dates;
1049@date_check = qw(01-NOV-1997 01-NOV-1996 31-OCT-2006 );
1050for my $date (@dates) {
1051    is($date, shift @date_check);
1052}
1053
1054@idcheck = qw(Z66513 T22647 Cel.30446 Q06319 Q20772 F54D5.7 cel:F54D5.7
1055         WBGene00010052 F54D5.7 GO:0005515 IPR006089 IPR006091 IPR006090
1056         IPR006092 IPR009075 IPR013786 IPR009100 IPR013764 PF00441 PF02770
1057         PF02771 PS00072 PS00073 );
1058
1059for my $dblink ( $seq->annotation->get_Annotations('dblink') ) {
1060    is($dblink->primary_id, shift @idcheck);
1061}
1062
1063$seqio = Bio::SeqIO->new( -verbose => $verbose,
1064                         -format => 'swissdriver',
1065                         -file   => test_input_file('rel9.swiss'));
1066
1067@namespaces = qw(Swiss-Prot TrEMBL TrEMBL);
1068
1069while (my $seq = $seqio->next_seq) {
1070    is($seq->namespace, shift @namespaces);
1071}
1072
1073# bug 2288
1074# Q8GBD3.swiss
1075$seqio = Bio::SeqIO->new( -verbose => $verbose,
1076                         -format => 'swiss',
1077                         -file   => test_input_file('Q8GBD3.swiss'));
1078
1079while (my $seq = $seqio->next_seq) {
1080    my $lineage = join(';', $seq->species->classification);
1081	is ($lineage, 'Acetobacter aceti;Acetobacter subgen. Acetobacter;'.
1082		'Acetobacter;Acetobacteraceae;Rhodospirillales;Alphaproteobacteria;'.
1083		'Proteobacteria;Bacteria');
1084}
1085
1086# test for GenBank swissprot/UniProt/UniProtKB DBSOURCE line (Bug : RT 44536)
1087$ast = Bio::SeqIO->new(-format => 'genbank',
1088                              -verbose => $verbose,
1089                       -file => test_input_file('P39765.gb'));
1090$ast->verbose($verbose);
1091$as = $ast->next_seq();
1092is $as->molecule, 'PRT',$as->accession_number;;
1093is $as->alphabet, 'protein';
1094# Though older GenBank releases indicate SOURCE contains only the common name,
1095# this is no longer true.  In general, this line will contain an abbreviated
1096# form of the full organism name (but may contain the full length name),
1097# as well as the optional common name and organelle.  There is no get/set
1098# for the abbreviated name but it is accessible via name()
1099ok defined($as->species->name('abbreviated')->[0]);
1100is $as->species->name('abbreviated')->[0], 'Bacillus subtilis';
1101is($as->primary_id, 20141743);
1102$ac = $as->annotation;
1103ok defined $ac;
1104@dblinks = $ac->get_Annotations('dblink');
1105is(scalar @dblinks,31);
1106is($dblinks[0]->database, 'UniProtKB');
1107is($dblinks[0]->primary_id, 'PYRR_BACSU');
1108is($dblinks[0]->version, undef);
1109is($dblinks[0]->display_text, 'UniProtKB:PYRR_BACSU','operator overloading in AnnotationI is deprecated');
1110