1package XML::SAXDriver::Excel;
2
3use strict;
4
5use Spreadsheet::ParseExcel;
6
7use base qw(XML::SAX::Base);
8use vars qw($VERSION $NS_SAXDriver_Excel);
9$VERSION = '0.06';
10$NS_SAXDriver_Excel = 'http://xmlns.perl.org/sax/XML::SAXDriver::Excel';
11
12sub _parse_bytestream
13{
14  my $self = shift;
15  my $stream = shift;
16
17  die ("Cannot use String to parse a binary Excel file.  You can only use a file by setting SystemId");
18
19  $self->_parse_Excel(\$stream);
20}
21sub _parse_string
22{
23  my $self = shift;
24  my $stream = shift;
25
26  ### This is experimental due to the binary streams
27  #require IO::String;
28  #$io = IO::String->new($var);
29  #$self->_parse_Excel($io);
30
31  die ("Cannot use String to parse a binary Excel file.  You can only use a file by setting SystemId");
32
33  #my @strings = split("\n", $self->{ParseOptions}->{Source}{String});
34  #$self->_parse_Excel(\@strings);
35}
36sub _parse_systemid
37{
38  my $self = shift;
39  my $path = shift;
40
41  $self->_init();
42  $self->{ParseOptions}->{Parser}->Parse($path);
43  $self->_end();
44
45}
46
47sub _init {
48  my $self = shift;
49
50  ### Reset vars before parsing
51  $self->{_row} = [];  ## Used to push row values per row
52	$self->{_row_num} = -1;  ## Set at -1 since rows are counted from 0
53	$self->{_last_row_num} = 0;  ## Used to save the last row value received
54	$self->{_last_col} = 0;
55
56  $self->{ParseOptions}->{Parser} ||= Spreadsheet::ParseExcel->new(CellHandler => \&cb_routine, Object => $self, NotSetCell => 1);
57
58  $self->{ParseOptions}->{Headings_Handler} ||= \&_normalize_heading;
59
60  $self->{_document} = {};
61  $self->{ParseOptions}->{Handler}->start_document($self->{_document});
62  $self->xml_decl($self->{ParseOptions}->{Declaration});
63  my $pm_excel = $self->_create_node(
64                                  Prefix       => 'SAXDriver::Excel',
65                                  NamespaceURI => $NS_SAXDriver_Excel,
66                                  );
67  $self->start_prefix_mapping($pm_excel);
68  $self->end_prefix_mapping($pm_excel);
69
70  $self->{_doc_element} = {
71              Name => $self->{ParseOptions}->{File_Tag} || "records",
72              Attributes => {},
73          };
74
75  $self->{ParseOptions}->{Handler}->start_element($self->{_doc_element});
76}
77
78  ## Parse file or string
79
80
81sub _end
82{
83  my $self = shift;
84  _print_xml_finish($self);
85
86  ### Reset vars after parsing
87  $self->{_row} = [];  ## Used to push row values per row
88	$self->{_row_num} = -1;  ## Set at -1 since rows are counted from 0
89	$self->{_last_row_num} = 0;  ## Used to save the last row value received
90
91  $self->{ParseOptions}->{Handler}->end_element($self->{_doc_element});
92
93  return $self->{ParseOptions}->{Handler}->end_document($self->{_document});
94
95}
96
97sub cb_routine($$$$$$)
98{
99  my ($self, $oBook, $iSheet, $iRow, $iCol, $oCell) = @_;
100
101  my $oWkS = $oBook->{Worksheet}[$iSheet];
102
103  $self->{ParseOptions}->{Col_Headings} ||= [];
104
105if ($iCol < $oWkS->{MaxCol})
106  {
107
108    if ($self->{_last_col} > $iCol)
109  	{
110  	  while ($self->{_last_col} < $oWkS->{MaxCol})
111  	  {
112  	    push(@{$self->{_row}}, undef);
113  	    $self->{_last_col}++;
114  	  }
115  	  _print_xml(@_);
116  	}
117
118    if ($self->{_last_col} < $iCol)
119  	{
120  	  while ($self->{_last_col} < $iCol)
121  	  {
122  	    push(@{$self->{_row}}, undef);
123  	    $self->{_last_col}++;
124  	  }
125  	}
126
127  	  push(@{$self->{_row}}, $oCell->Value());
128  	  $self->{_last_row_num} = $iRow;
129  	  $self->{_last_col}++;
130  	  return;
131
132
133  }
134
135  push(@{$self->{_row}}, $oCell->Value());# if $flag == 0;
136
137  _print_xml(@_);
138  return;
139
140}
141
142sub _normalize_heading  ### Default if no Headings_Handler is provided
143{
144  my $heading= shift;
145  my $sub_char = shift || '_';
146  $heading =~ s/^\s//g;
147  $heading =~ s/\s$//g;
148  $heading =~ s/^([^a-zA-Z|^_|^:])/$sub_char/g;   ### We used to also replace the xml in the beginning, but I took it of per recommendation of Michael Rodriguez.
149  $heading =~ s/[^a-zA-Z|^-|^.|^0-9|^:]/$sub_char/g;
150  return $heading;
151}
152
153
154sub _print_xml
155{
156  my ($self, $oBook, $iSheet, $iRow, $iCol, $oCell) = @_;  ### Remember self is passed through the Spreadsheet::ParseExcel object
157
158  my $oWkS = $oBook->{Worksheet}[$iSheet];
159
160  $self->{_last_row_num} = $iRow;
161
162
163  $self->{_last_col} = 0;
164  my $temp_row = $oCell->Value();
165  $self->{_row_num} = $self->{_last_row_num};
166
167
168      if (!@{$self->{ParseOptions}->{Col_Headings}} && !$self->{ParseOptions}->{Dynamic_Col_Headings})
169      {
170              my $i = 1;
171              @{$self->{ParseOptions}->{Col_Headings}} = map { "column" . $i++ } @{$self->{_row}};
172      }
173      elsif (!@{$self->{ParseOptions}->{Col_Headings}} && $self->{ParseOptions}->{Dynamic_Col_Headings})
174      {
175              @{$self->{ParseOptions}->{Col_Headings}} = map { $self->{ParseOptions}->{Headings_Handler}->($_, $self->{ParseOptions}->{SubChar}); } @{$self->{_row}};
176              $self->{_row} = [];  ### Clear the @$row array
177              return;  ### So that it does not print the column headings as the content of the first node.
178      }
179
180
181      my $el = {
182        Name => $self->{ParseOptions}->{Parent_Tag} || "record",
183        Attributes => {},
184      };
185
186      $self->{ParseOptions}->{Handler}->start_element($el);
187
188      for (my $i = 0; $i <= $#{$self->{ParseOptions}->{Col_Headings}}; $i++) {
189          my $column = { Name => $self->{ParseOptions}->{Col_Headings}->[$i], Attributes => {} };
190
191          $self->{ParseOptions}->{Handler}->start_element($column);
192          $self->{ParseOptions}->{Handler}->characters({Data => $self->{_row}->[$i]});
193          $self->{ParseOptions}->{Handler}->end_element($column);
194      }
195
196      $self->{ParseOptions}->{Handler}->end_element($el);
197
198  $self->{_row} = [];  ### Clear $row and start the new row processing
199
200}
201
202sub _print_xml_finish
203{
204  my $self = shift;
205
206  while (@{$self->{_row}} < 9)
207  {
208    push(@{$self->{_row}}, undef);
209  }
210
211  my $el = {
212        Name => $self->{ParseOptions}->{Parent_Tag} || "record",
213        Attributes => {},
214      };
215
216      $self->{ParseOptions}->{Handler}->start_element($el);
217
218      for (my $i = 0; $i <= $#{$self->{ParseOptions}->{Col_Headings}}; $i++) {
219          my $column = { Name => $self->{ParseOptions}->{Col_Headings}->[$i], Attributes => {} };
220
221          $self->{ParseOptions}->{Handler}->start_element($column);
222          $self->{ParseOptions}->{Handler}->characters({Data => $self->{_row}->[$i]});
223          $self->{ParseOptions}->{Handler}->end_element($column);
224      }
225
226      $self->{ParseOptions}->{Handler}->end_element($el);
227}
228
229sub _create_node {
230    shift;
231    # this may check for a factory later
232    return {@_};
233}
234
2351;
236__END__
237
238
239
240
241=head1 NAME
242
243  XML::SAXDriver::Excel - SAXDriver for converting Excel files to XML
244
245=head1 SYNOPSIS
246
247    use XML::SAXDriver::Excel;
248    my $driver = XML::SAXDriver::Excel->new(%attr);
249    $driver->parse(%attr);
250
251=head1 DESCRIPTION
252
253  XML::SAXDriver::Excel was developed as a complement to
254  XML::Excel, though it provides a SAX interface, for
255  gained performance and efficiency, to Excel files.
256  Specific object attributes and handlers are set to
257  define the behavior of the parse() method.  It does
258  not matter where you define your attributes.  If they
259  are defined in the new() method, they will apply to
260  all parse() calls.  You can override in any call to
261  parse() and it will remain local to that function call
262  and not effect the rest of the object.
263
264=head1 XML::SAXDriver::Excel properties
265
266  Source - (Reference to a String, ByteStream, SystemId)
267
268    String - **currently not supported** Contains literal Excel data.
269             Ex (Source => {String => $foo})
270
271    ByteStream - **currently not supported** Contains a filehandle reference.
272                 Ex. (Source => {ByteStream => \*STDIN})
273
274    SystemId - Contains the path to the file containing
275               the Excel  data. Ex (Source => {SystemId => '../excel/foo.xls'})
276
277
278  Handler - Contains the object to be used as a XML print handler
279
280  DTDHandler - Contains the object to be used as a XML DTD handler.
281               ****There is no DTD support available at this time.
282               I'll make it available in the next version.****
283
284  SubChar - Specifies the character(s) to use to substitute illegal chars in xml tag names, that
285            will be generated from the first row, but setting the Dynamic_Col_Headings.
286
287  Col_Headings - Reference to the array of column names to be used for XML tag names.
288
289  Dynamic_Col_Headings - Should be set if you want the XML tag names generated dynamically
290                         from the first row in Excel file.  **Make sure that the number of columns
291                         in your first row is equal to the largest row in the document.  You
292                         don't generally have to worry about if you are submitting valid Excel
293                         data, where each row will have the same number of columns, even if
294                         they are empty.
295
296  Headings_Handler - Should be used along with Dynamic_Col_Headings to provide a heading
297                     normalization handler, to conform the headings to the XML 1.0
298                     specifications.  If not provided, a default will be used that only
299                     works with ASCII chars, therefore any other character sets need to
300                     provide a custom handler!  The handler sub will be passed the heading
301                     string as the first argument.
302
303=head1 AUTHOR
304
305Ilya Sterin (isterin@cpan.org)
306
307=head1 SEE ALSO
308
309XML::Excel
310Spreadsheet::ParseExcel
311
312=cut
313