1package Boulder::Unigene;
2#
3use Boulder::Stream;
4require Exporter;
5@ISA = qw(Exporter Boulder::Stream);
6@EXPORT = ();
7@EXPORT_OK = ();
8use Carp;
9$VERSION=1.0;
10use constant DEFAULT_UNIGENE_PATH => '/data/unigene/Hs.dat';
11
12=head1 NAME
13
14Boulder::Unigene - Fetch Unigene data records as parsed Boulder Stones
15
16=head1 SYNOPSIS
17
18  # parse a file of Unigene records
19  $ug = new Boulder::Unigene(-accessor=>'File',
20                             -param => '/data/unigene/Hs.dat');
21  while (my $s = $ug->get) {
22    print $s->Identifier;
23    print $s->Gene;
24  }
25
26  # parse flatfile records yourself
27  open (UG,"/data/unigene/Hs.dat");
28  local $/ = "*RECORD*";
29  while (<UG>) {
30     my $s = Boulder::Unigene->parse($_);
31     # etc.
32  }
33
34=head1 DESCRIPTION
35
36Boulder::Unigene provides retrieval and parsing services for UNIGENE records
37
38Boulder::Unigene provides retrieval and parsing services for NCBI
39Unigene records.  It returns Unigene entries in L<Stone>
40format, allowing easy access to the various fields and values.
41Boulder::Unigene is a descendent of Boulder::Stream, and provides a
42stream-like interface to a series of Stone objects.
43
44Access to Unigene is provided by one I<accessors>, which
45give access to  local Unigene database.  When you
46create a new Boulder::Unigene stream, you provide the
47accessors, along with accessor-specific parameters that control what
48entries to fetch.  The accessors is:
49
50=over 2
51
52=item File
53
54This provides access to local Unigene entries by reading from a flat file
55(typically Hs.dat file downloadable from NCBI's Ftp site).
56The stream will return a Stone corresponding to each of the entries in
57the file, starting from the top of the file and working downward.  The
58parameter is the path to the local file.
59
60=back
61
62It is also possible to parse a single Unigene entry from a text string
63stored in a scalar variable, returning a Stone object.
64
65=head2 Boulder::Unigene methods
66
67This section lists the public methods that the I<Boulder::Unigene>
68class makes available.
69
70=over 4
71
72=item new()
73
74   # Local fetch via File
75   $ug=new Boulder::Unigene(-accessor  =>  'File',
76                            -param     =>  '/data/unigene/Hs.dat');
77
78The new() method creates a new I<Boulder::Unigene> stream on the
79accessor provided.  The only possible accessors is B<File>.
80If successful, the method returns the stream
81object.  Otherwise it returns undef.
82
83new() takes the following arguments:
84
85	-accessor	Name of the accessor to use
86	-param		Parameters to pass to the accessor
87
88Specify the accessor to use with the B<-accessor> argument.  If not
89specified, it defaults to B<File>.
90
91B<-param> is an accessor-specific argument.  The possibilities is:
92
93For B<File>, the B<-param> argument must point to a string-valued
94scalar, which will be interpreted as the path to the file to read
95Unigene entries from.
96
97=item get()
98
99The get() method is inherited from I<Boulder::Stream>, and simply
100returns the next parsed Unigene Stone, or undef if there is nothing
101more to fetch.  It has the same semantics as the parent class,
102including the ability to restrict access to certain top-level tags.
103
104=item put()
105
106The put() method is inherited from the parent Boulder::Stream class,
107and will write the passed Stone to standard output in Boulder format.
108This means that it is currently not possible to write a
109Boulder::Unigene object back into Unigene flatfile form.
110
111=back
112
113=head1 OUTPUT TAGS
114
115The tags returned by the parsing operation are taken from the names shown in the Flat file
116Hs.dat since no better description of them is provided yet by the database source producer.
117
118=head2 Top-Level Tags
119
120These are tags that appear at the top level of the parsed Unigene
121entry.
122
123=over 4
124
125=item Identifier
126
127The Unigene identifier of this entry.  Identifier is a single-value tag.
128
129Example:
130
131      my $identifierNo = $s->Identifier;
132
133=item Title
134
135The Unigene title for this entry.
136
137Example:
138      my $titledef=$s->Title;
139
140=item Gene
141The Gene associated with   this Unigene entry
142
143Example:
144      my $thegene=$s->Gene;
145
146=item Cytoband
147The cytological band position of this entry
148
149Example:
150      my $thecytoband=$s->Cytoband;
151
152=item Counts
153The number of EST in this record
154
155Example:
156      my $thecounts=$s->Counts;
157
158=item LocusLink
159The id of the LocusLink entry associated with this record
160
161Example:
162      my $thelocuslink=$s->LocusLink;
163
164=item Chromosome
165This field contains a list, of the chromosomes numbers in which this entry has been linked
166
167Example:
168      my @theChromosome=$s->Chromosome;
169
170=back
171
172=head2 STS
173Multiple records in the form ^STS     ACC=XXXXXX NAME=YYYYYY
174
175=over 4
176
177=item ACC
178
179=item NAME
180
181=back
182
183=head2 TXMAP
184Multiple records in the form  ^TXMAP  XXXXXXX; MARKER=YYYYY; RHPANEL=ZZZZ
185
186The TXMAP tag points to a Stone record that contains multiple
187subtags.  Each subtag is the name of a feature which points, in turn,
188to a Stone that describes the feature's location and other attributes.
189
190Each feature will contain one or more of the following subtags:
191
192=over 4
193
194=item MARKER
195
196=item RHPANEL
197
198=back
199
200
201=head2 PROTSIM
202Multiple records in the form ^PROTSIM ORG=XXX; PROTID=DBID:YYY; PCT=ZZZ; ALN=QQQQ
203Where DBID is
204	PID for indicate presence of GenPept identifier,
205	SP to indicate SWISSPROT identifier,
206	PIR to indicate PIR identifier,
207	PRF to indicate ???
208
209=over 4
210
211=item ORG
212
213=item PROTID
214
215=item PCT
216
217=item ALN
218
219=back
220
221=head2 SEQUENCE
222Multiple records in the form ^SEQUENCE ACC=XXX; NID=YYYY; PID = CLONE= END= LID=
223
224=over
225
226=item ACC
227
228=item NID
229
230=item PID
231
232=item CLONE
233
234=item END
235
236=item LID
237
238=back
239
240=head1 SEE ALSO
241
242L<Boulder>, L<Boulder::Blast>, L<Boulder::Genbank>
243
244=head1 AUTHOR
245
246Lincoln Stein <lstein@cshl.org>.
247Luca I.G. Toldo <luca.toldo@merck.de>
248
249Copyright (c) 1997 Lincoln D. Stein
250Copyright (c) 1999 Luca I.G. Toldo
251
252This library is free software; you can redistribute it and/or modify
253it under the same terms as Perl itself.  See DISCLAIMER.txt for
254disclaimers of warranty.
255
256=cut
257
258#
259# Following did not require any changes compared to Genbank.pm
260#
261sub new  {
262    my($package,@parameters) = @_;
263    # superclass constructor
264    my($self) = new Boulder::Stream;
265
266    # figure out whether parameters are named.  Look for
267    # an initial '-'
268    if ($parameters[0]=~/^-/) {
269	my(%parameters) = @parameters;
270	$self->{'accessor'}=$parameters{'-accessor'} || 'File';
271	$self->{'param'}=$parameters{'-param'};
272	$self->{'OUT'}=$parameters{'-out'} || 'main::STDOUT';
273    } else {
274	$self->{'accessor'}='File';
275	$self->{'param'}=[@parameters];
276    }
277
278    croak "Require parameters" unless defined($self->{'param'});
279    $self->{'accessor'} = new {$self->{'accessor'}}($self->{'param'});
280
281    return bless $self,$package;
282}
283
284#
285# Following required no changes compared to Genbank.pm
286#
287sub read_record {
288    my($self,@tags) = @_;
289    my($s);
290
291    if (wantarray) {
292	my(@result);
293	while (!$self->{'done'}) {
294	    $s = $self->read_one_record(@tags);
295	    next unless $s;
296	    next if $query && !(&$query);
297	    push(@result,$s);
298	}
299	return @result;
300    }
301
302    # we get here if in a scalar context
303    while (!$self->{'done'}) {
304	$s = $self->read_one_record(@tags);
305	next unless $s;
306	return $s unless $query;
307	return $s if &$query;
308    }
309    return undef;
310}
311
312#<LIGT>
313# Here is everything new
314#</LIGT>
315sub parse {
316  my $self = shift;
317  my $record = shift;
318  return unless $record;
319  my $tags = shift;
320  my %ok;
321  %ok = map {$_ => 1} @$tags if ref($tags) eq 'ARRAY';
322    my($s,@lines,$line,$accumulated,$key,$keyword,$value,$feature,@features, $label);
323
324  $s = new Stone;
325#<LIGT> following this line the parsing of the record must be done
326#              each key-value pair is stored by the following command:
327#	$self->_addToStone($key,$value,$stone,\%ok);
328#
329# Process new record lines
330#
331#
332  (@recordlines)=split(/\n/,$record);
333 undef $unigeneid, $title, $gene,$cytoband, $locuslink, $chromosome, $scount;
334 undef $sts, $txmap,$protsim,$sequence;
335 undef @sts,@txmaps,@protsims,@sequences;
336  foreach $line  (@recordlines) {
337    if      ($line=~/^ID/) {
338     ($key,$unigeneid)=split(/\s+/,$line);
339     $self->_addToStone('Identifier',$unigeneid,$s,\%ok);
340    } elsif ($line=~/^TITLE/) {
341     (@titles)=split(/\s+/,$line);
342     shift @titles;
343     $title=join(' ',@titles);
344     $self->_addToStone('Title',$title,$s,\%ok);
345    } elsif ($line=~/^GENE/) {
346     ($key,$gene)=split(/\s+/,$line);
347     $self->_addToStone('Gene',$gene,$s,\%ok);
348    } elsif ($line=~/^CYTOBAND/) {
349     ($key,$cytoband)=split(/\s+/,$line);
350     $self->_addToStone('Cytoband',$cytoband,$s,\%ok);
351    } elsif ($line=~/^LOCUSLINK/) {
352     ($key,$locuslink)=split(/\s+/,$line);
353     $self->_addToStone('Locuslink',$locuslink,$s,\%ok);
354    } elsif ($line=~/^CHROMOSOME/) {
355     ($key,$chromosome)=split(/\s+/,$line);
356     $self->_addToStone('Chromosome',$chromosome,$s,\%ok);
357    } elsif ($line=~/^SCOUNT/) {
358     ($key,$scount)=split(/\s+/,$line);
359     $self->_addToStone('Scount',$scount,$s,\%ok);
360    } elsif ($line=~/^STS/) {
361#STS ACC=XXX; NAME=YYY;
362     (@sts)=split(/\s+/,$line); shift @sts;  $sts=join(' ',@sts);
363     ($tmpacc,$tmpname)=split(/\s+/,$sts);
364     ($jnk,$acc)=split(/\=/,$tmpacc);
365     ($jnk,$name)=split(/\=/,$tmpname);
366
367     undef @features;
368     $featurelabel="Accession"; $featurevalue=$name;
369     $feature = {'label'=>$featurelabel,'value'=>$featurevalue};
370     push(@features,$feature);
371     $featurelabel="Name";
372     $feature = {'label'=>$featurelabel,'value'=>$featurevalue};
373     push(@features,$feature);
374
375      $self->_addFeaturesToStone(\@features,_trim($'),$s,\%ok);
376    } elsif ($line=~/^TXMAP/) {
377#TXMAP  XXX; MARKER=YYY; RHPANEL=ZZZ;
378     (@txmaps)=split(/\s+/,$line); shift @txmaps;  $txmap=join(' ',@txmaps);
379#     $self->_addToStone('TXMAP',$txmap,$s,\%ok);
380    undef @features;
381     $self->_addFeaturesToStone(\@features,_trim($'),$s,\%ok);
382    } elsif ($line=~/^PROTSIM/) {
383#PROTSIM ORG=QQQ; PROTID=RRR; PCT=SSSS; ALN=TTTT;
384     (@protsims)=split(/\s+/,$line); shift @protsims;  $protsim=join(' ',@protsims);
385#     $self->_addToStone('PROTSIM',$protsim,$s,\%ok);
386    undef @features;
387     $self->_addFeaturesToStone(\@features,_trim($'),$s,\%ok);
388    } elsif ($line=~/^SEQUENCE/) {
389#SEQUENCE ACC=XXXX; NID=YYYY; PID=RRRRR; CLONE=QQQ; END=PPPP; LID=ZZZZ;
390     (@sequences)=split(/\s+/,$line); shift @sequences;  $sequence=join(' ',@sequences);
391#     $self->_addToStone('SEQUENCE',$sequence,$s,\%ok);
392    undef @features;
393     $self->_addFeaturesToStone(\@features,_trim($'),$s,\%ok);
394   }
395 }
396#</LIGT>
397  return $s;
398}
399
400#
401# Following is unchanged from Genbank.pm
402#
403sub read_one_record {
404  my($self,@tags) = @_;
405  my(%ok);
406  my $accessor = $self->{'accessor'};
407  my $record   = $accessor->fetch_next();
408  unless ($record) {
409    $self->{'done'}++;
410    return undef;
411  }
412
413  return $self->parse($record,\@tags);
414}
415
416#
417# Following is unchanged from Genbank.pm
418#
419sub _trim {
420    my($v) = @_;
421    $v=~s/^\s+//;
422    $v=~s/\s+$//;
423    return $v;
424}
425
426#
427# Following is unchanged from Genbank.pm
428#
429sub _canonicalize {
430  my $h = shift;
431  substr($h,0)=~tr/a-z/A-Z/;
432  substr($h,1,length($h)-1)=~tr/A-Z/a-z/;
433  $h;
434}
435
436#
437# Following is unchanged from Genbank.pm
438#
439sub _addToStone {
440    my($self,$xlabel,$value,$stone,$ok) = @_;
441    return unless !%{$ok} || $ok->{$xlabel};
442    $stone->insert(_canonicalize($xlabel),$value);
443}
444
445#<LIGT>
446# Following is entirely rewritten
447#</LIGT>
448sub _addFeaturesToStone {
449	my($self,$features,$basecount,$stone,$ok) = @_;
450	my($f) = new Stone;
451	foreach (@$features) {
452		my($q) = $_->{'value'};
453		my($label) = _canonicalize($_->{'label'});
454		my($position) = $q=~m!^([^/]+)!;
455		my @qualifiers = $q=~m!/(\w+)=([^/]+)!g;
456		my %qualifiers;
457		while (my($key,$value) = splice(@qualifiers,0,2)) {
458			$value =~ s/^\s*\"//;
459			$value =~s/\"\s*$//;
460			$value=~s/\s+//g if uc($key) eq 'TRANSLATION';
461			$qualifiers{_canonicalize($key)} = $value;
462		}
463		$f->insert($label=>new Stone('Position'=>$position,%qualifiers));
464	}
465	$stone->insert('Features',$f);
466}
467
468
469
470# -------------------------- DEFINITION OF ACCESSOR OBJECTS ------------------------------
471#<LIGT>
472#only name changes for avoid namespace collisions
473#</LIGT>
474package UnigeneAccessor;
475use Carp;
476
477sub new {
478    my($class,@parameters) = @_;
479    croak "UnigeneAccessor::new:  Abstract class\n";
480}
481
482sub fetch_next {
483    my($self) = @_;
484    croak "UnigeneAccessor::fetch_next: Abstract class\n";
485}
486
487sub DESTROY {
488}
489
490#<LIGT>
491# Following, only the File package since the only one supported.
492# If other access methods must be supported, then here appropriate
493# packages and methods must be implemented
494#</LIGT>
495package File;
496use Carp;
497@ISA=qw(UnigeneAccessor);
498$DEFAULT_PATH = Boulder::Unigene::DEFAULT_UNIGENE_PATH();
499
500#<LIGT>
501# Following, removed the search for the string locus in the file
502#   as validation that the input be compliant with parser
503#</LIGT>
504sub new {
505    my($package,$path) = @_;
506    $path = $DEFAULT_PATH unless $path;
507    open (UG,$path) or croak "File::new(): couldn't open $path: $!";
508    # read the junk at the beginning
509    my $found; $found++;
510    croak "File::new(): $path doesn't look like a Unigene flat file"
511	unless $found;
512    $_ = <UG>;
513    return bless {'fh'=>UG},$package;
514}
515
516#<LIGT>
517# Following, changed the record separator
518#</LIGT>
519sub fetch_next {
520    my $self = shift;
521    return undef unless $self->{'fh'};
522    local($/)="//\n";
523    my($line);
524    my($fh) = $self->{'fh'};
525    chomp($line = <$fh>);
526    return $line;
527}
528
5291;
530
531__END__
532
533