1# $Id: XML.pm,v 1.2 1999/11/30 21:06:05 lstein Exp $
2
3# Boulder::XML
4#
5# XML input/output for Stone objects
6
7package Boulder::XML;
8
9=head1 NAME
10
11Boulder::XML - XML format input/output for Boulder streams
12
13=head1 SYNOPSIS
14
15   use Boulder::XML;
16
17   $stream = Boulder::XML->newFh;
18
19   while ($stone = <$stream>) {
20        print $stream $stone;
21   }
22
23=head1 DESCRIPTION
24
25Boulder::XML generates BoulderIO streams from XML files and/or
26streams.  It is also able to output Boulder Stones in XML format.  Its
27semantics are similar to those of Boulder::Stream, except that there
28is never any pass-through behavior.
29
30Because XML was not designed for streaming, some care must be taken
31when reading an XML document into a series of Stones.  Consider this
32XML document:
33
34 <?xml version="1.0" standalone="yes"?>
35
36 <Paper>
37   <Author>Lincoln Stein</Author>
38   <Author>Jean Siao</Author>
39   <Date>September 29, 1999</Date>
40   <Copyright copyrighted="yes">1999 Lincoln Stein</Copright>
41   <Abstract>
42       This is the abstract.  It is not anything very fancy,
43       but it will do.
44   </Abstract>
45   <Citation>
46        <Author>Fitchberg J</Author>
47        <Journal>Journal of Irreproducible Results</Journal>
48        <Volume>23</Volume>
49        <Year>1998</Volume>
50   </Citation>
51   <Citation>
52        <Author>Clemenson V</Author>
53        <Journal>Ecumenica</Journal>
54        <Volume>10</Volume>
55        <Year>1968</Volume>
56   </Citation>
57   <Citation>
58        <Author>Ruggles M</Author>
59        <Journal>Journal of Aesthetic Surgery</Journal>
60        <Volume>10</Volume>
61        <Year>1999</Volume>
62   </Citation>
63 </Paper>
64
65Ordinarily the document will be construed as a single Paper tag
66containing subtags Author, Date, Copyright, Abstract, and so on.
67However it might be desirable to fetch out just the citation tags as a
68series of Stones.  In this case, you can declare Citation to be the
69top level tag by passing the B<-tag> argument to new(). Now calling
70get() will return each of the three Citation sections in turn.  If no
71tag is explicitly declared to be the top level tag, then Boulder::XML
72will take the first tag it sees in the document.
73
74It is possible to stream XML files.  You can either separate them into
75separate documents and use the automatic ARGV processing features of
76the BoulderIO library, or separate the XML documents using a
77B<delimiter> string similar to the delimiters used in MIME multipart
78documents.  By default, BoulderIO uses a delimiter of
79E<lt>!--Boulder::XML--E<gt>.
80
81B<This is not a general XML parsing engine!> Instead, it is a way to
82represent BoulderIO tag/value streams in XML format.  The module uses
83XML::Parser to parse the XML streams, and therefore any syntactic
84error in the stream can cause the XML parser to quit with an error.
85Another thing to be aware of is that there are certain XML
86constructions that will not translate into BoulderIO format, specifically
87free text that contains embedded tags.  This is OK:
88
89  <Author>Jean Siao</Author>
90
91but this is not:
92
93  <Author>The <Emphatic>extremely illustrious</Emphatic> Jean Siao</Author>
94
95In BoulderIO format, tags can contain other tags or text, but cannot
96contain a mixture of tags and text.
97
98=head2 CONSTRUCTORS
99
100=over 4
101
102=item $stream = Boulder::XML->new(*IN,*OUT);
103
104=item $stream = Boulder::XML->new(-in=>*IN,-out=>*OUT,-tag=>$tag,-delim=>$delim,-strip=>$strip)
105
106new() creates a new Boulder::XML stream that can be read from or
107written to.  All arguments are optional.
108
109 -in    Filehandle to read from.
110        If a file name is provided, will open the file.
111        Defaults to the magic <> filehandle.
112
113 -out   Filehandle to write to.
114        If a file name is provided, will open the file for writing.
115        Defaults to STDOUT
116
117 -tag   The top-level XML tag to consider as the Stone record.  Defaults
118        to the first tag seen when reading from an XML file, or to
119        E<lt>StoneE<gt> when writing to an output stream without
120        previously having read.
121
122 -delim Delimiter to use for delimiting multiple Stone objects in an
123        XML stream.
124
125 -strip If true, automatically strips leading and trailing whitespace
126        from text contained within tags.
127
128=item $fh = Boulder::XML->newFh(*IN,*OUT);
129
130=item $fh = Boulder::XML->newFh(-in=>*IN,-out=>*OUT,-tag=>$tag,-delim=>$delim,-strip=>$strip)
131
132The newFh() constructor creates a tied filehandle that can read and
133write Boulder::XML streams.  Invoking <> on the filehandle will
134perform a get(), returning a Stone object.  Calling print() on the
135filehandle will perform a put(), writing a Stone object to output in
136XML format.
137
138=back
139
140=head2 METHODS
141
142=over 4
143
144=item $stone = $stream->get()
145
146=item $stream->put($stone)
147
148=item $done = $stream->done
149
150All these methods have the same semantics as the similar methods in
151L<Boulder::Stream>, except that pass-through behavior doesn't apply.
152
153=back
154
155=head1 AUTHOR
156
157Lincoln D. Stein <lstein@cshl.org>, Cold Spring Harbor Laboratory,
158Cold Spring Harbor, NY.  This module can be used and distributed on
159the same terms as Perl itself.
160
161=head1 SEE ALSO
162
163L<Boulder>, L<Boulder::Stream>, L<Stone>
164
165=cut
166
167use Boulder::Stream;
168use Stone;
169use XML::Parser;
170
171use strict;
172use vars qw(@ISA);
173
174@ISA = 'Boulder::Stream';
175*rearrange = \&Boulder::Stream::rearrange;
176*put = \&write_record;
177
178sub new {
179  my $package = shift;
180  my($in,$out,$tag,$delim,$strip) = rearrange(['IN','OUT','TAG','DELIM','STRIP'],@_);
181  my $self = bless {
182		    'top_level' => $tag,
183		    'delim' => $delim || '<!--Boulder::XML-->',
184		    'strip' => $strip,
185		    'in'  => Boulder::Stream->to_fh($in)    || \*ARGV,
186		    'out' => Boulder::Stream->to_fh($out,1) || \*STDOUT,
187		     },$package;
188  my $parser = XML::Parser->new(
189				ErrorContext => 2,
190				Stream_Delimiter => $self->{delim},
191			       );
192  @ARGV = ('-') if $self->{in} == \*ARGV and !@ARGV;
193  $parser->setHandlers(
194		       Start   => sub { $self->_start(@_) },
195		       Default => sub { $self->_default(@_) },
196		       End     => sub { $self->_end(@_) }
197		      );
198  $self->{'parser'} = $parser;
199  return $self;
200}
201
202sub read_one_record {
203  my ($self,@tags) = @_;
204  return shift @{$self->{stones}} if $self->{stones} && @{$self->{stones}};
205  my $fh = $self->magic_file_open || return;
206  $self->{parser}->parse($fh);
207  return shift @{$self->{stones}};
208}
209
210sub write_record {
211  my $self = shift;
212  my @stone = @_;
213  my $out = $self->{out};
214  print $out $self->{delim},"\n" if $self->{printed}++;
215  print $out qq(<?xml version="1.0" standalone="yes"?>\n\n);
216  for my $stone (@stone) {
217    next unless ref $stone && $stone->can('asXML');
218    print $out $stone->asXML($self->{top_level});
219  }
220}
221
222sub magic_file_open {
223  my $self = shift;
224  my $fh = $self->{in};
225  return $fh unless $fh == \*main::ARGV;
226  return $fh unless eof $fh;
227  return unless my $a = shift @ARGV;
228  open $fh,$a or die "$a: $!";
229  return $fh;
230}
231
232sub done {
233  my $self = shift;
234  return if defined $self->{stones} && @{$self->{stones}};
235  return eof $self->{in} if $self->{in} != \*main::ARGV;
236  return $self->{in} && eof $self->{in} && !@ARGV;
237}
238
239sub _default {
240  my ($self,$p, $string) = @_;
241  return unless $string=~/\S/;
242  if ($self->{'strip'}) { # strip leading whitespace
243    $string =~ s/^\s+//;
244    $string =~ s/\s+$//;
245  }
246
247  return unless $self->{stack} && @{$self->{stack}};
248  my $stone = $self->{stack}[-1];
249  my $current = $stone->name();
250  $current .= $string;
251  $stone->name($current);
252}
253
254sub _start {
255  my ($self,$p, $element, %attributes) = @_;
256
257  $self->{top_level} ||= $element;
258  if ($element eq $self->{top_level}) {
259    $self->{stack} = [$self->{stone} = new Stone];  # empty stone
260    $self->{stone}->attributes(\%attributes) if %attributes;
261    return;
262  }
263
264  return unless $self->{stack}[-1];
265  my $s = new Stone;
266  $self->{stack}[-1]->insert($element => $s);
267  push(@{$self->{stack}},$s);
268  $s->attributes(\%attributes) if %attributes;
269}
270
271sub _end {
272  my ($self,$p, $element) = @_;
273
274  pop @{$self->{stack}};
275
276  if ( $element eq $self->{top_level} ) {
277    push @{$self->{stones}},$self->{stone};
278    delete $self->{stone};
279    delete $self->{stack};
280  }
281
282}  # End end
283
284
2851;
286