1 /* PSPP - a program for statistical analysis.
2    Copyright (C) 2007, 2009, 2010, 2011, 2012, 2013, 2016  Free Software Foundation, Inc.
3 
4    This program is free software: you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation, either version 3 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
16 
17 #include <config.h>
18 
19 #include "data/gnumeric-reader.h"
20 #include "spreadsheet-reader.h"
21 
22 #include <assert.h>
23 #include <stdbool.h>
24 #include <errno.h>
25 #include <libxml/xmlreader.h>
26 #include <zlib.h>
27 
28 #include "data/case.h"
29 #include "data/casereader-provider.h"
30 #include "data/data-in.h"
31 #include "data/dictionary.h"
32 #include "data/format.h"
33 #include "data/identifier.h"
34 #include "data/value.h"
35 #include "data/variable.h"
36 #include "libpspp/i18n.h"
37 #include "libpspp/message.h"
38 #include "libpspp/misc.h"
39 #include "libpspp/str.h"
40 
41 #include "gl/c-strtod.h"
42 #include "gl/minmax.h"
43 #include "gl/xalloc.h"
44 
45 #include "gettext.h"
46 #define _(msgid) gettext (msgid)
47 #define N_(msgid) (msgid)
48 
/* Value types of Gnumeric cells.  In this file they are obtained by
   converting the "ValueType" attribute of a cell element to an integer
   and are then compared against these constants.

   Shamelessly lifted from the Gnumeric sources:
   https://git.gnome.org/browse/gnumeric/tree/src/value.h
 */
enum gnm_value_type
{
  VALUE_EMPTY   = 10,
  VALUE_BOOLEAN = 20,
  VALUE_INTEGER = 30, /* Note: this was removed from gnumeric in 2006.  Old versions may of
			 course still be around.  New ones are supposed to use float. */
  VALUE_FLOAT   = 40,
  VALUE_ERROR   = 50,
  VALUE_STRING  = 60,
  VALUE_CELLRANGE  = 70,
  VALUE_ARRAY   = 80
};
64 
65 
66 
/* Casereader callbacks for Gnumeric files.  Both are defined further
   down in this file. */
static void gnm_file_casereader_destroy (struct casereader *, void *);

static struct ccase *gnm_file_casereader_read (struct casereader *, void *);


/* The casereader class that gnumeric_make_reader passes to
   casereader_create_sequential.  Only the first two slots (read and
   destroy) are filled in; the remaining two are left null. */
static const struct casereader_class gnm_file_casereader_class =
  {
    gnm_file_casereader_read,
    gnm_file_casereader_destroy,
    NULL,
    NULL,
  };
79 
/* States of the XML scanning state machine implemented by
   process_node. */
enum reader_state
  {
    STATE_PRE_INIT = 0,        /* Initial state: waiting for gnm:SheetNameIndex */
    STATE_SHEET_COUNT,      /* Found the sheet index: counting and naming sheets */
    STATE_INIT ,           /* Sheet index finished (or between sheets): waiting for gnm:Sheet */
    STATE_SHEET_START,     /* Found the start of a sheet */
    STATE_SHEET_NAME,      /* Found the sheet name (also re-entered when gnm:Cells ends) */
    STATE_MAXROW,          /* Inside gnm:MaxRow of the wanted sheet */
    STATE_MAXCOL,          /* Inside gnm:MaxCol of the wanted sheet */
    STATE_SHEET_FOUND,     /* Found the sheet that we actually want */
    STATE_CELLS_START,     /* Found the start of the cell array */
    STATE_CELL             /* Found a (non-empty) cell */
  };
93 
/* What is known about one sheet of the workbook.  Entries are created
   and filled in by process_node. */
struct sheet_detail
{
  /* The name of the sheet (utf8 encoding).  Obtained from
     xmlTextReaderValue and released with xmlFree; null until a name has
     been read. */
  char *name;

  /* Column and row of the first cell seen (start_*) and of the last
     cell seen before the end of the cell array (stop_*).  All four are
     -1 until they have been determined. */
  int start_col;
  int stop_col;
  int start_row;
  int stop_row;

  /* Values read from the text of the sheet's gnm:MaxCol and gnm:MaxRow
     elements. */
  int maxcol;
  int maxrow;
};
107 
/* The state of one pass over the XML document.  A gnumeric_reader holds
   two of these, each with its own libxml reader. */
struct state_data
{
  /* The libxml reader for this instance */
  xmlTextReaderPtr xtr;

  /* An internal state variable */
  enum reader_state state;

  /* Type of the node most recently handed to process_node
     (an XML_READER_TYPE_* value). */
  int node_type;

  /* Zero-based index of the sheet being scanned; -1 before the first
     gnm:Sheet element has been seen. */
  int current_sheet;

  /* Row and column of the most recently seen gnm:Cell element. */
  int row;
  int col;

  /* Smallest column seen so far in the current cell array; reset to
     INT_MAX when a gnm:Cells element starts. */
  int min_col;
};
124 
125 
126 static void
127 state_data_destroy (struct state_data *sd)
128 {
129   xmlFreeTextReader (sd->xtr);
130 }
131 
132 
/* A Gnumeric spreadsheet that has been opened for reading. */
struct gnumeric_reader
{
  /* Generic spreadsheet data.  Must remain the first member: this file
     casts between struct spreadsheet * and struct gnumeric_reader *. */
  struct spreadsheet spreadsheet;

  /* rsd: scanning state used for reading cases (set up when an existing
     reader is re-opened).
     msd: scanning state set up when the file is first opened; used to
     discover sheet names and ranges. */
  struct state_data rsd;
  struct state_data msd;

  /* The cell range to read.  When no range is requested,
     gnumeric_make_reader sets start_row to 0 and the others to -1. */
  int start_col;
  int stop_col;
  int start_row;
  int stop_row;

  /* One entry per sheet; spreadsheet.n_sheets gives the count. */
  struct sheet_detail *sheets;

  /* The sheet to read: by name if target_sheet is non-null, otherwise by
     target_sheet_index, which is compared against the zero-based sheet
     number plus one.  An index of -1 accepts whichever sheet is
     encountered. */
  const xmlChar *target_sheet;
  int target_sheet_index;

  /* Prototype and dictionary for the cases produced, plus the first
     case, which is built ahead of time by gnumeric_make_reader.
     used_first_case records whether first_case has already been handed
     out by the read callback. */
  struct caseproto *proto;
  struct dictionary *dict;
  struct ccase *first_case;
  bool used_first_case;

  /* ValueType attribute of the cell most recently examined. */
  enum gnm_value_type vtype;
};
157 
158 
159 void
160 gnumeric_unref (struct spreadsheet *s)
161 {
162   struct gnumeric_reader *r = (struct gnumeric_reader *) s;
163 
164   if (0 == --s->ref_cnt)
165     {
166       int i;
167 
168       for (i = 0; i < s->n_sheets; ++i)
169 	{
170 	  xmlFree (r->sheets[i].name);
171 	}
172 
173 
174       free (r->sheets);
175       state_data_destroy (&r->msd);
176 
177       dict_unref (r->dict);
178 
179       free (s->file_name);
180 
181       free (r);
182     }
183 }
184 
185 
186 const char *
187 gnumeric_get_sheet_name (struct spreadsheet *s, int n)
188 {
189   struct gnumeric_reader *gr = (struct gnumeric_reader *) s;
190   assert (n < s->n_sheets);
191 
192   return gr->sheets[n].name;
193 }
194 
195 
/* Advances SD's state machine using the XML node that SD's reader is
   currently positioned on, updating R's sheet table as needed.  Defined
   below. */
static void process_node (struct gnumeric_reader *r, struct state_data *sd);
197 
198 
199 
200 char *
201 gnumeric_get_sheet_range (struct spreadsheet *s, int n)
202 {
203   int ret;
204   struct gnumeric_reader *gr = (struct gnumeric_reader *) s;
205 
206   assert (n < s->n_sheets);
207 
208   while (
209 	 (gr->sheets[n].stop_col == -1)
210 	 &&
211 	 (1 == (ret = xmlTextReaderRead (gr->msd.xtr)))
212 	)
213     {
214       process_node (gr, &gr->msd);
215     }
216 
217   return create_cell_range (
218 			  gr->sheets[n].start_col,
219 			  gr->sheets[n].start_row,
220 			  gr->sheets[n].stop_col,
221 			  gr->sheets[n].stop_row);
222 }
223 
224 
225 static void
226 gnm_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
227 {
228   struct gnumeric_reader *r = r_;
229 
230   if (r == NULL)
231 	return ;
232 
233   state_data_destroy (&r->rsd);
234 
235   if (r->first_case &&  ! r->used_first_case)
236     case_unref (r->first_case);
237 
238   if (r->proto)
239     caseproto_unref (r->proto);
240 
241   gnumeric_unref (&r->spreadsheet);
242 }
243 
244 
245 static void
246 process_node (struct gnumeric_reader *r, struct state_data *sd)
247 {
248   xmlChar *name = xmlTextReaderName (sd->xtr);
249   if (name == NULL)
250     name = xmlStrdup (_xml ("--"));
251 
252   sd->node_type = xmlTextReaderNodeType (sd->xtr);
253 
254   switch (sd->state)
255     {
256     case STATE_PRE_INIT:
257       sd->current_sheet = -1;
258       if (0 == xmlStrcasecmp (name, _xml("gnm:SheetNameIndex")) &&
259 	  XML_READER_TYPE_ELEMENT  == sd->node_type)
260 	{
261 	  sd->state = STATE_SHEET_COUNT;
262 	}
263       break;
264 
265     case STATE_SHEET_COUNT:
266       if (0 == xmlStrcasecmp (name, _xml("gnm:SheetName")) &&
267 	  XML_READER_TYPE_ELEMENT  == sd->node_type)
268 	{
269 	  ++sd->current_sheet;
270 	  if (sd->current_sheet + 1 > r->spreadsheet.n_sheets)
271 	    {
272 	      struct sheet_detail *detail ;
273 	      r->sheets = xrealloc (r->sheets, (sd->current_sheet + 1) * sizeof *r->sheets);
274 	      detail = &r->sheets[sd->current_sheet];
275 	      detail->start_col = detail->stop_col = detail->start_row = detail->stop_row = -1;
276 	      detail->name = NULL;
277 	      r->spreadsheet.n_sheets = sd->current_sheet + 1;
278 	    }
279 	}
280       else if (0 == xmlStrcasecmp (name, _xml("gnm:SheetNameIndex")) &&
281 	  XML_READER_TYPE_END_ELEMENT  == sd->node_type)
282 	{
283 	  sd->state = STATE_INIT;
284 	  sd->current_sheet = -1;
285 	}
286       else if (XML_READER_TYPE_TEXT == sd->node_type)
287 	{
288 	  if (r->sheets [r->spreadsheet.n_sheets - 1].name == NULL)
289 	    r->sheets [r->spreadsheet.n_sheets - 1].name = CHAR_CAST (char *, xmlTextReaderValue (sd->xtr));
290 	}
291       break;
292 
293     case STATE_INIT:
294       if (0 == xmlStrcasecmp (name, _xml("gnm:Sheet")) &&
295 	  XML_READER_TYPE_ELEMENT  == sd->node_type)
296 	{
297 	  ++sd->current_sheet;
298 	  sd->state = STATE_SHEET_START;
299 	}
300       break;
301     case STATE_SHEET_START:
302       if (0 == xmlStrcasecmp (name, _xml("gnm:Name"))  &&
303 	  XML_READER_TYPE_ELEMENT  == sd->node_type)
304 	{
305 	  sd->state = STATE_SHEET_NAME;
306 	}
307       break;
308     case STATE_SHEET_NAME:
309       if (0 == xmlStrcasecmp (name, _xml("gnm:Name"))  &&
310 	  XML_READER_TYPE_END_ELEMENT  == sd->node_type)
311 	{
312 	  sd->state = STATE_INIT;
313 	}
314       else if (0 == xmlStrcasecmp (name, _xml("gnm:Sheet"))  &&
315 	  XML_READER_TYPE_END_ELEMENT  == sd->node_type)
316 	{
317 	  sd->state = STATE_INIT;
318 	}
319       else if (XML_READER_TYPE_TEXT == sd->node_type)
320 	{
321        	  if (r->target_sheet != NULL)
322 	    {
323 	      xmlChar *value = xmlTextReaderValue (sd->xtr);
324 	      if (0 == xmlStrcmp (value, r->target_sheet))
325 		sd->state = STATE_SHEET_FOUND;
326 	      free (value);
327 	    }
328 	  else if (r->target_sheet_index == sd->current_sheet + 1)
329 	    {
330 	      sd->state = STATE_SHEET_FOUND;
331 	    }
332 	  else if (r->target_sheet_index == -1)
333 	    {
334 	      sd->state = STATE_SHEET_FOUND;
335 	    }
336 	}
337       break;
338     case STATE_SHEET_FOUND:
339       if (0 == xmlStrcasecmp (name, _xml("gnm:Cells"))  &&
340 	  XML_READER_TYPE_ELEMENT  == sd->node_type)
341 	{
342 	  sd->min_col = INT_MAX;
343 	  if (! xmlTextReaderIsEmptyElement (sd->xtr))
344 	    sd->state = STATE_CELLS_START;
345 	}
346       else if (0 == xmlStrcasecmp (name, _xml("gnm:MaxRow"))  &&
347 	  XML_READER_TYPE_ELEMENT  == sd->node_type)
348 	{
349 	  sd->state = STATE_MAXROW;
350 	}
351       else if (0 == xmlStrcasecmp (name, _xml("gnm:MaxCol"))  &&
352 	  XML_READER_TYPE_ELEMENT  == sd->node_type)
353 	{
354 	  sd->state = STATE_MAXCOL;
355 	}
356       else if (0 == xmlStrcasecmp (name, _xml("gnm:Sheet"))  &&
357 	  XML_READER_TYPE_END_ELEMENT  == sd->node_type)
358 	{
359       	  sd->state = STATE_INIT;
360 	}
361       break;
362     case STATE_MAXROW:
363       if (0 == xmlStrcasecmp (name, _xml("gnm:MaxRow"))  &&
364 	  XML_READER_TYPE_END_ELEMENT  == sd->node_type)
365 	{
366 	  sd->state = STATE_SHEET_FOUND;
367 	}
368       else if (sd->node_type == XML_READER_TYPE_TEXT)
369 	{
370 	  xmlChar *value = xmlTextReaderValue (sd->xtr);
371 	  r->sheets[sd->current_sheet].maxrow = _xmlchar_to_int (value);
372 	  xmlFree (value);
373 	}
374       break;
375     case STATE_MAXCOL:
376       if (0 == xmlStrcasecmp (name, _xml("gnm:MaxCol"))  &&
377 	  XML_READER_TYPE_END_ELEMENT  == sd->node_type)
378 	{
379 	  sd->state = STATE_SHEET_FOUND;
380 	}
381       else if (sd->node_type == XML_READER_TYPE_TEXT)
382 	{
383 	  xmlChar *value = xmlTextReaderValue (sd->xtr);
384 	  r->sheets[sd->current_sheet].maxcol = _xmlchar_to_int (value);
385 	  xmlFree (value);
386 	}
387       break;
388     case STATE_CELLS_START:
389       if (0 == xmlStrcasecmp (name, _xml ("gnm:Cell"))  &&
390 	  XML_READER_TYPE_ELEMENT  == sd->node_type)
391 	{
392 	  xmlChar *attr = NULL;
393 
394 	  attr = xmlTextReaderGetAttribute (sd->xtr, _xml ("Col"));
395 	  sd->col =  _xmlchar_to_int (attr);
396 	  free (attr);
397 
398 	  if (sd->col < sd->min_col)
399 	    sd->min_col = sd->col;
400 
401 	  attr = xmlTextReaderGetAttribute (sd->xtr, _xml ("Row"));
402 	  sd->row = _xmlchar_to_int (attr);
403 	  free (attr);
404 
405 	  if (r->sheets[sd->current_sheet].start_row == -1)
406 	    {
407 	      r->sheets[sd->current_sheet].start_row = sd->row;
408 	    }
409 
410 	  if (r->sheets[sd->current_sheet].start_col == -1)
411 	    {
412 	      r->sheets[sd->current_sheet].start_col = sd->col;
413 	    }
414 	  if (! xmlTextReaderIsEmptyElement (sd->xtr))
415 	    sd->state = STATE_CELL;
416 	}
417       else if ((0 == xmlStrcasecmp (name, _xml("gnm:Cells")))  &&  (XML_READER_TYPE_END_ELEMENT  == sd->node_type))
418 	{
419 	  r->sheets[sd->current_sheet].stop_col = sd->col;
420 	  r->sheets[sd->current_sheet].stop_row = sd->row;
421 	  sd->state = STATE_SHEET_NAME;
422 	}
423       break;
424     case STATE_CELL:
425       if (0 == xmlStrcasecmp (name, _xml("gnm:Cell"))  && XML_READER_TYPE_END_ELEMENT  == sd->node_type)
426 	{
427 	  sd->state = STATE_CELLS_START;
428 	}
429       break;
430     default:
431       break;
432     };
433 
434   xmlFree (name);
435 }
436 
437 
438 /*
439    Sets the VAR of case C, to the value corresponding to the xml string XV
440  */
441 static void
442 convert_xml_string_to_value (struct ccase *c, const struct variable *var,
443 			     const xmlChar *xv, enum gnm_value_type type, int col, int row)
444 {
445   union value *v = case_data_rw (c, var);
446 
447   if (xv == NULL)
448     value_set_missing (v, var_get_width (var));
449   else if (var_is_alpha (var))
450     value_copy_str_rpad (v, var_get_width (var), xv, ' ');
451   else if (type == VALUE_FLOAT || type == VALUE_INTEGER)
452     {
453       const char *text = CHAR_CAST (const char *, xv);
454       char *endptr;
455 
456       errno = 0;
457       v->f = c_strtod (text, &endptr);
458       if (errno != 0 || endptr == text)
459 	v->f = SYSMIS;
460     }
461   else
462     {
463       const char *text = CHAR_CAST (const char *, xv);
464 
465       const struct fmt_spec *fmt = var_get_write_format (var);
466 
467       char *m = data_in (ss_cstr (text), "UTF-8",
468 			 fmt->type,
469 			 v,
470 			 var_get_width (var),
471 			 "UTF-8");
472 
473       if (m)
474 	{
475 	  char buf [FMT_STRING_LEN_MAX + 1];
476 	  char *cell = create_cell_ref (col, row);
477 
478 	  msg (MW, _("Cannot convert the value in the spreadsheet cell %s to format (%s): %s"),
479 	       cell, fmt_to_string (fmt, buf), m);
480 	  free (cell);
481 	}
482       free (m);
483     }
484 }
485 
/* What gnumeric_make_reader has learned about one column while
   scanning the header row and the first row of data. */
struct var_spec
{
  /* Variable name taken from the header row, or null if none was read. */
  char *name;

  /* Variable width: -1 while undetermined, 0 when the first data cell
     is not a string. */
  int width;

  /* Copy (made with xmlStrdup) of the text of the first data cell, or
     null. */
  xmlChar *first_value;

  /* Value type associated with the first data cell; -1 initially. */
  int first_type;
};
493 
494 
495 static void
496 gnumeric_error_handler (void *ctx, const char *mesg,
497 			xmlParserSeverities sev UNUSED,
498 			xmlTextReaderLocatorPtr loc)
499 {
500   struct gnumeric_reader *r = ctx;
501 
502   msg (MW, _("There was a problem whilst reading the %s file `%s' (near line %d): `%s'"),
503        "Gnumeric",
504        r->spreadsheet.file_name,
505        xmlTextReaderLocatorLineNumber (loc),
506        mesg);
507 }
508 
509 static struct gnumeric_reader *
510 gnumeric_reopen (struct gnumeric_reader *r, const char *filename, bool show_errors)
511 {
512   int ret = -1;
513   struct state_data *sd;
514 
515   xmlTextReaderPtr xtr;
516   gzFile gz;
517 
518   assert (r == NULL || filename == NULL);
519 
520   if (filename)
521     {
522       gz = gzopen (filename, "r");
523     }
524   else
525     {
526       gz = gzopen (r->spreadsheet.file_name, "r");
527     }
528 
529   if (NULL == gz)
530     return NULL;
531 
532 
533   xtr = xmlReaderForIO ((xmlInputReadCallback) gzread,
534 			(xmlInputCloseCallback) gzclose, gz,
535 			NULL, NULL,
536 			show_errors ? 0 : (XML_PARSE_NOERROR | XML_PARSE_NOWARNING));
537 
538   if (xtr == NULL)
539     {
540       gzclose (gz);
541       return NULL;
542     }
543 
544   if (r == NULL)
545     {
546       r = xzalloc (sizeof *r);
547       r->spreadsheet.n_sheets = -1;
548       r->spreadsheet.file_name = strdup (filename);
549       sd = &r->msd;
550     }
551   else
552     {
553       sd = &r->rsd;
554     }
555 
556   if (show_errors)
557     xmlTextReaderSetErrorHandler (xtr, gnumeric_error_handler, r);
558 
559   r->target_sheet = NULL;
560   r->target_sheet_index = -1;
561 
562   sd->row = sd->col = -1;
563   sd->state = STATE_PRE_INIT;
564   sd->xtr = xtr;
565   r->spreadsheet.ref_cnt++;
566 
567 
568   /* Advance to the start of the workbook.
569      This gives us some confidence that we are actually dealing with a gnumeric
570      spreadsheet.
571    */
572   while ((sd->state != STATE_INIT)
573 	  && 1 == (ret = xmlTextReaderRead (sd->xtr)))
574     {
575       process_node (r, sd);
576     }
577 
578 
579   if (ret != 1)
580     {
581       /* Does not seem to be a gnumeric file */
582       gnumeric_unref (&r->spreadsheet);
583       return NULL;
584     }
585 
586   r->spreadsheet.type = SPREADSHEET_GNUMERIC;
587 
588   if (show_errors)
589     {
590       const xmlChar *enc = xmlTextReaderConstEncoding (sd->xtr);
591       xmlCharEncoding xce = xmlParseCharEncoding (CHAR_CAST (const char *, enc));
592 
593       if (XML_CHAR_ENCODING_UTF8 != xce)
594 	{
595 	  /* I have been told that ALL gnumeric files are UTF8 encoded.  If that is correct, this
596 	     can never happen. */
597 	  msg (MW, _("The gnumeric file `%s' is encoded as %s instead of the usual UTF-8 encoding. "
598 		     "Any non-ascii characters will be incorrectly imported."),
599 	       r->spreadsheet.file_name,
600 	       enc);
601 	}
602     }
603 
604   return r;
605 }
606 
607 
608 struct spreadsheet *
609 gnumeric_probe (const char *filename, bool report_errors)
610 {
611   struct gnumeric_reader *r = gnumeric_reopen (NULL, filename, report_errors);
612 
613   return &r->spreadsheet;
614 }
615 
616 
617 struct casereader *
618 gnumeric_make_reader (struct spreadsheet *spreadsheet,
619 		      const struct spreadsheet_read_options *opts)
620 {
621   int type = 0;
622   int x = 0;
623   struct gnumeric_reader *r = NULL;
624   unsigned long int vstart = 0;
625   int ret;
626   casenumber n_cases = CASENUMBER_MAX;
627   int i;
628   struct var_spec *var_spec = NULL;
629   int n_var_specs = 0;
630 
631   r = (struct gnumeric_reader *) (spreadsheet);
632 
633   r = gnumeric_reopen (r, NULL, true);
634 
635   if (opts->cell_range)
636     {
637       if (! convert_cell_ref (opts->cell_range,
638 			       &r->start_col, &r->start_row,
639 			       &r->stop_col, &r->stop_row))
640 	{
641 	  msg (SE, _("Invalid cell range `%s'"),
642 	       opts->cell_range);
643 	  goto error;
644 	}
645     }
646   else
647     {
648       r->start_col = -1;
649       r->start_row = 0;
650       r->stop_col = -1;
651       r->stop_row = -1;
652     }
653 
654   r->target_sheet = BAD_CAST opts->sheet_name;
655   r->target_sheet_index = opts->sheet_index;
656   r->rsd.row = r->rsd.col = -1;
657   r->rsd.current_sheet = -1;
658   r->first_case = NULL;
659   r->proto = NULL;
660 
661   /* Advance to the start of the cells for the target sheet */
662   while ((r->rsd.state != STATE_CELL || r->rsd.row < r->start_row)
663 	  && 1 == (ret = xmlTextReaderRead (r->rsd.xtr)))
664     {
665       xmlChar *value ;
666       process_node (r, &r->rsd);
667       value = xmlTextReaderValue (r->rsd.xtr);
668 
669       if (r->rsd.state == STATE_MAXROW  && r->rsd.node_type == XML_READER_TYPE_TEXT)
670 	{
671 	  n_cases = 1 + _xmlchar_to_int (value) ;
672 	}
673       free (value);
674     }
675 
676   /* If a range has been given, then  use that to calculate the number
677      of cases */
678   if (opts->cell_range)
679     {
680       n_cases = MIN (n_cases, r->stop_row - r->start_row + 1);
681     }
682 
683   if (opts->read_names)
684     {
685       r->start_row++;
686       n_cases --;
687     }
688 
689 
690   /* Read in the first row of cells,
691      including the headers if read_names was set */
692   while (
693 	 ((r->rsd.state == STATE_CELLS_START && r->rsd.row <= r->start_row) || r->rsd.state == STATE_CELL)
694 	 && (ret = xmlTextReaderRead (r->rsd.xtr))
695 	)
696     {
697       int idx;
698 
699       if (r->rsd.state == STATE_CELL && r->rsd.node_type == XML_READER_TYPE_TEXT)
700 	{
701 	  xmlChar *attr =
702 	    xmlTextReaderGetAttribute (r->rsd.xtr, _xml ("ValueType"));
703 
704 	  type  =  _xmlchar_to_int (attr);
705 
706 	  xmlFree (attr);
707 	}
708 
709       process_node (r, &r->rsd);
710 
711       if (r->rsd.row > r->start_row)
712 	{
713 	  xmlChar *attr =
714 	    xmlTextReaderGetAttribute (r->rsd.xtr, _xml ("ValueType"));
715 
716 	  r->vtype  =  _xmlchar_to_int (attr);
717 
718 	  xmlFree (attr);
719 	  break;
720 	}
721 
722       if (r->rsd.col < r->start_col ||
723 	   (r->stop_col != -1 && r->rsd.col > r->stop_col))
724 	continue;
725 
726       idx = r->rsd.col - r->start_col;
727 
728       if (idx  >= n_var_specs)
729 	{
730 	  int i;
731 	  var_spec = xrealloc (var_spec, sizeof (*var_spec) * (idx + 1));
732 	  for (i = n_var_specs; i <= idx; ++i)
733 	  {
734 	    var_spec [i].name = NULL;
735 	    var_spec [i].width = -1;
736 	    var_spec [i].first_value = NULL;
737 	    var_spec [i].first_type = -1;
738 	  }
739 	  n_var_specs =  idx + 1 ;
740 	}
741 
742       var_spec [idx].first_type = type;
743 
744       if (r->rsd.node_type == XML_READER_TYPE_TEXT)
745 	{
746 	  xmlChar *value = xmlTextReaderValue (r->rsd.xtr);
747 	  const char *text  = CHAR_CAST (const char *, value);
748 
749 	  if (r->rsd.row < r->start_row)
750 	    {
751 	      if (opts->read_names)
752 		{
753 		  var_spec [idx].name = xstrdup (text);
754 		}
755 	    }
756 	  else
757 	    {
758 	      var_spec [idx].first_value = xmlStrdup (value);
759 
760 	      if (-1 ==  var_spec [idx].width)
761 		var_spec [idx].width = (opts->asw == -1) ?
762 		  ROUND_UP (strlen(text), SPREADSHEET_DEFAULT_WIDTH) : opts->asw;
763 	    }
764 
765 	  free (value);
766 	}
767       else if (r->rsd.node_type == XML_READER_TYPE_ELEMENT
768 		&& r->rsd.state == STATE_CELL)
769 	{
770 	  if (r->rsd.row == r->start_row)
771 	    {
772 	      xmlChar *attr =
773 		xmlTextReaderGetAttribute (r->rsd.xtr, _xml ("ValueType"));
774 
775 	      if (NULL == attr || VALUE_STRING !=  _xmlchar_to_int (attr))
776 		var_spec [idx].width = 0;
777 
778 	      free (attr);
779 	    }
780 	}
781     }
782 
783   {
784     const xmlChar *enc = xmlTextReaderConstEncoding (r->rsd.xtr);
785     if (enc == NULL)
786       goto error;
787     /* Create the dictionary and populate it */
788     spreadsheet->dict = r->dict = dict_create (CHAR_CAST (const char *, enc));
789   }
790 
791   for (i = 0 ; i < n_var_specs ; ++i)
792     {
793       char *name;
794 
795       if ((var_spec[i].name == NULL) && (var_spec[i].first_value == NULL))
796 	continue;
797 
798       /* Probably no data exists for this variable, so allocate a
799 	 default width */
800       if (var_spec[i].width == -1)
801 	var_spec[i].width = SPREADSHEET_DEFAULT_WIDTH;
802 
803       name = dict_make_unique_var_name (r->dict, var_spec[i].name, &vstart);
804       dict_create_var (r->dict, name, var_spec[i].width);
805       free (name);
806     }
807 
808   /* Create the first case, and cache it */
809   r->used_first_case = false;
810 
811   if (n_var_specs ==  0)
812     {
813       msg (MW, _("Selected sheet or range of spreadsheet `%s' is empty."),
814            spreadsheet->file_name);
815       goto error;
816     }
817 
818   r->proto = caseproto_ref (dict_get_proto (r->dict));
819   r->first_case = case_create (r->proto);
820   case_set_missing (r->first_case);
821 
822 
823   for (i = 0 ; i < n_var_specs ; ++i)
824     {
825       const struct variable *var;
826 
827       if ((var_spec[i].name == NULL) && (var_spec[i].first_value == NULL))
828 	continue;
829 
830       var = dict_get_var (r->dict, x++);
831 
832       convert_xml_string_to_value (r->first_case, var,
833 				   var_spec[i].first_value,
834 				   var_spec[i].first_type,
835 				   r->rsd.col + i - 1,
836 				   r->rsd.row - 1);
837     }
838 
839   for (i = 0 ; i < n_var_specs ; ++i)
840     {
841       free (var_spec[i].first_value);
842       free (var_spec[i].name);
843     }
844 
845   free (var_spec);
846 
847 
848   return casereader_create_sequential
849     (NULL,
850      r->proto,
851      n_cases,
852      &gnm_file_casereader_class, r);
853 
854 
855  error:
856   for (i = 0 ; i < n_var_specs ; ++i)
857     {
858       free (var_spec[i].first_value);
859       free (var_spec[i].name);
860     }
861 
862   free (var_spec);
863 
864   gnm_file_casereader_destroy (NULL, r);
865 
866   return NULL;
867 };
868 
869 
870 /* Reads and returns one case from READER's file.  Returns a null
871    pointer on failure. */
872 static struct ccase *
873 gnm_file_casereader_read (struct casereader *reader UNUSED, void *r_)
874 {
875   struct ccase *c;
876   int ret = 0;
877 
878   struct gnumeric_reader *r = r_;
879   int current_row = r->rsd.row;
880 
881   if (!r->used_first_case)
882     {
883       r->used_first_case = true;
884       return r->first_case;
885     }
886 
887   c = case_create (r->proto);
888   case_set_missing (c);
889 
890   if (r->start_col == -1)
891     r->start_col = r->rsd.min_col;
892 
893 
894   while ((r->rsd.state == STATE_CELL || r->rsd.state == STATE_CELLS_START)
895 	 && r->rsd.row == current_row && (ret = xmlTextReaderRead (r->rsd.xtr)))
896     {
897       process_node (r, &r->rsd);
898 
899       if (r->rsd.state == STATE_CELL && r->rsd.node_type == XML_READER_TYPE_ELEMENT)
900 	{
901 	  xmlChar *attr =
902 	    xmlTextReaderGetAttribute (r->rsd.xtr, _xml ("ValueType"));
903 
904 	  r->vtype  = _xmlchar_to_int (attr);
905 
906 	  xmlFree (attr);
907 	}
908 
909       if (r->rsd.col < r->start_col || (r->stop_col != -1 &&
910 				     r->rsd.col > r->stop_col))
911 	continue;
912 
913       if (r->rsd.col - r->start_col >= caseproto_get_n_widths (r->proto))
914 	continue;
915 
916       if (r->stop_row != -1 && r->rsd.row > r->stop_row)
917 	break;
918 
919 
920       if (r->rsd.node_type == XML_READER_TYPE_TEXT)
921 	{
922 	  xmlChar *value = xmlTextReaderValue (r->rsd.xtr);
923 	  const int idx = r->rsd.col - r->start_col;
924 	  const struct variable *var = dict_get_var (r->dict, idx);
925 
926 	  convert_xml_string_to_value (c, var, value, r->vtype,
927 				       r->rsd.col, r->rsd.row);
928 
929 	  xmlFree (value);
930 	}
931     }
932 
933   if (ret == 1)
934     return c;
935   else
936     {
937       case_unref (c);
938       return NULL;
939     }
940 }
941