1 /*
2 * stf.c : Utilizes the stf-parse engine and the dialog-stf to provide a plug-in for
3 * importing text files with a structure (CSV/fixed width)
4 *
5 * Copyright (C) Almer. S. Tigelaar <almer@gnome.org>
6 * Copyright (C) 1999-2009 Morten Welinder (terra@gnome.org)
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, see <https://www.gnu.org/licenses/>.
20 */
21
22 #include <gnumeric-config.h>
23 #include <glib/gi18n-lib.h>
24 #include <gnumeric.h>
25 #include <stf.h>
26 #include <stf-export.h>
27
28 #include <goffice/goffice.h>
29 #include <cell.h>
30 #include <sheet.h>
31 #include <sheet-view.h>
32 #include <sheet-style.h>
33 #include <style.h>
34 #include <mstyle.h>
35 #include <command-context.h>
36 #include <wbc-gtk.h>
37 #include <workbook-view.h>
38 #include <workbook.h>
39 #include <dialogs/dialog-stf.h>
40 #include <dialogs/dialog-stf-export.h>
41 #include <position.h>
42 #include <expr.h>
43 #include <value.h>
44 #include <gnm-format.h>
45 #include <selection.h>
46 #include <ranges.h>
47 #include <clipboard.h>
48 #include <parse-util.h>
49 #include <commands.h>
50 #include <gui-util.h>
51 #include <gutils.h>
52
53 #include <gsf/gsf-input.h>
54 #include <string.h>
55 #include <gsf/gsf-output.h>
56 #include <gsf/gsf-output-memory.h>
57 #include <gsf/gsf-utils.h>
58 #include <locale.h>
59
60 static void
stf_warning(GOIOContext * context,char const * msg)61 stf_warning (GOIOContext *context, char const *msg)
62 {
63 /*
64 * Using go_cmd_context_error_import will destroy the
65 * successfully imported portion. We ought to have a
66 * way to issue a warning.
67 */
68 if (GNM_IS_WBC_GTK (context->impl))
69 go_gtk_notice_dialog
70 (wbcg_toplevel (WBC_GTK (context->impl)),
71 GTK_MESSAGE_WARNING,
72 "%s", msg);
73 else
74 g_warning ("%s", msg);
75 }
76
77
78 /*
79 * stf_open_and_read:
80 * @filename: name of the file to open&read
81 *
82 * Will open filename, read the file into a g_alloced memory buffer
83 *
84 * NOTE : The returned buffer has to be g_freed by the calling routine.
85 *
86 * returns : a buffer containing the file contents
87 */
88 static char *
stf_open_and_read(G_GNUC_UNUSED GOIOContext * context,GsfInput * input,size_t * readsize)89 stf_open_and_read (G_GNUC_UNUSED GOIOContext *context, GsfInput *input, size_t *readsize)
90 {
91 gpointer result;
92 gulong allocsize;
93 gsf_off_t size = gsf_input_size (input);
94
95 if (gsf_input_seek (input, 0, G_SEEK_SET))
96 return NULL;
97
98 *readsize = (size_t) size;
99 if ((gsf_off_t) *readsize != size) /* Check for overflow */
100 return NULL;
101 size++;
102 allocsize = (gulong) size;
103 if ((gsf_off_t) allocsize != size) /* Check for overflow */
104 return NULL;
105 result = g_try_malloc (allocsize);
106 if (result == NULL)
107 return NULL;
108
109 *((char *)result + *readsize) = '\0';
110
111 if (*readsize > 0 && gsf_input_read (input, *readsize, result) == NULL) {
112 g_warning ("gsf_input_read failed.");
113 g_free (result);
114 result = NULL;
115 }
116 return result;
117 }
118
119 static char *
stf_preparse(GOIOContext * context,GsfInput * input,size_t * data_len)120 stf_preparse (GOIOContext *context, GsfInput *input, size_t *data_len)
121 {
122 char *data;
123
124 data = stf_open_and_read (context, input, data_len);
125
126 if (!data) {
127 if (context)
128 go_cmd_context_error_import (GO_CMD_CONTEXT (context),
129 _("Error while trying to read file"));
130 return NULL;
131 }
132
133 return data;
134 }
135
136 static gboolean
stf_store_results(DialogStfResult_t * dialogresult,Sheet * sheet,int start_col,int start_row)137 stf_store_results (DialogStfResult_t *dialogresult,
138 Sheet *sheet, int start_col, int start_row)
139 {
140 return stf_parse_sheet (dialogresult->parseoptions,
141 dialogresult->text, NULL, sheet,
142 start_col, start_row);
143 }
144
145 static void
resize_columns(Sheet * sheet)146 resize_columns (Sheet *sheet)
147 {
148 GnmRange r;
149
150 if (gnm_debug_flag ("stf"))
151 g_printerr ("Auto-fitting columns...\n");
152
153 /* If we have lots of rows, auto-fitting will take a very long
154 time. It is probably better to look at only, say, 1000 rows
155 of data. */
156 range_init_full_sheet (&r, sheet);
157 r.end.row = MIN (r.end.row, 1000);
158
159 colrow_autofit (sheet, &r, TRUE,
160 TRUE, /* Ignore strings */
161 TRUE, /* Don't shrink */
162 TRUE, /* Don't shrink */
163 NULL, NULL);
164 if (gnm_debug_flag ("stf"))
165 g_printerr ("Auto-fitting columns... done\n");
166
167 sheet_queue_respan (sheet, 0, gnm_sheet_get_last_row (sheet));
168 }
169
170
171 /*
172 * stf_read_workbook:
173 * @fo: file opener
174 * @enc: encoding of file
175 * @context: command context
176 * @book: workbook
177 * @input: file to read from+convert
178 *
179 * Main routine, handles importing a file including all dialog mumbo-jumbo
180 */
181 static void
stf_read_workbook(G_GNUC_UNUSED GOFileOpener const * fo,gchar const * enc,GOIOContext * context,GoView * view,GsfInput * input)182 stf_read_workbook (G_GNUC_UNUSED GOFileOpener const *fo, gchar const *enc,
183 GOIOContext *context, GoView *view, GsfInput *input)
184 {
185 DialogStfResult_t *dialogresult = NULL;
186 char *name, *nameutf8 = NULL;
187 char *data = NULL;
188 size_t data_len;
189 WorkbookView *wbv = GNM_WORKBOOK_VIEW (view);
190
191 if (!GNM_IS_WBC_GTK (context->impl)) {
192 go_io_error_string (context, _("This importer can only be used with a GUI."));
193 return;
194 }
195
196 name = g_path_get_basename (gsf_input_name (input));
197 nameutf8 = g_filename_to_utf8 (name, -1, NULL, NULL, NULL);
198 g_free (name);
199 if (!nameutf8) {
200 g_warning ("Failed to convert filename to UTF-8. This shouldn't happen here.");
201 goto out;
202 }
203
204 data = stf_preparse (context, input, &data_len);
205 if (!data)
206 goto out;
207
208 dialogresult = stf_dialog (WBC_GTK (context->impl),
209 enc, FALSE, NULL, FALSE,
210 nameutf8, data, data_len);
211 if (dialogresult != NULL) {
212 Workbook *book = wb_view_get_workbook (wbv);
213 int cols = dialogresult->colcount, rows = dialogresult->rowcount;
214 Sheet *sheet;
215
216 gnm_sheet_suggest_size (&cols, &rows);
217 sheet = sheet_new (book, nameutf8, cols, rows);
218 workbook_sheet_attach (book, sheet);
219 if (stf_store_results (dialogresult, sheet, 0, 0)) {
220 workbook_recalc_all (book);
221 resize_columns (sheet);
222 workbook_set_saveinfo
223 (book,
224 GO_FILE_FL_WRITE_ONLY,
225 go_file_saver_for_id
226 ("Gnumeric_stf:stf_assistant"));
227 } else {
228 /* the user has cancelled */
229 /* the caller should notice that we have no sheets */
230 workbook_sheet_delete (sheet);
231 }
232 }
233
234 out:
235 g_free (nameutf8);
236 g_free (data);
237 if (dialogresult != NULL)
238 stf_dialog_result_free (dialogresult);
239 }
240
241 static GnmValue *
cb_get_content(GnmCellIter const * iter,GsfOutput * buf)242 cb_get_content (GnmCellIter const *iter, GsfOutput *buf)
243 {
244 GnmCell *cell;
245
246 if (NULL != (cell = iter->cell)) {
247 char *tmp;
248 if (gnm_cell_has_expr (cell))
249 tmp = gnm_expr_top_as_string (cell->base.texpr,
250 &iter->pp, iter->pp.sheet->convs);
251 else if (VALUE_FMT (cell->value) != NULL)
252 tmp = format_value (NULL, cell->value, -1,
253 workbook_date_conv (iter->pp.wb));
254 else
255 tmp = value_get_as_string (cell->value);
256
257 gsf_output_write (buf, strlen (tmp), tmp);
258 g_free (tmp);
259 }
260 gsf_output_write (buf, 1, "\n");
261
262 return NULL;
263 }
264
265 /**
266 * stf_text_to_columns:
267 * @wbc: The control making the request
268 * @cc:
269 *
270 * Main routine, handles importing a file including all dialog mumbo-jumbo
271 **/
272 void
stf_text_to_columns(WorkbookControl * wbc,GOCmdContext * cc)273 stf_text_to_columns (WorkbookControl *wbc, GOCmdContext *cc)
274 {
275 DialogStfResult_t *dialogresult = NULL;
276 SheetView *sv;
277 Sheet *src_sheet, *target_sheet;
278 GnmRange const *src;
279 GnmRange target;
280 GsfOutput *buf;
281 guint8 const *data;
282 size_t data_len;
283
284 sv = wb_control_cur_sheet_view (wbc);
285 src_sheet = sv_sheet (sv);
286 src = selection_first_range (sv, cc, _("Text to Columns"));
287 if (src == NULL)
288 return;
289 if (range_width (src) > 1) {
290 go_cmd_context_error (cc, g_error_new (go_error_invalid (), 0,
291 _("Only one column of input data can be parsed at a time")));
292 return;
293 }
294
295 /* FIXME : how to do this cleanly ? */
296 if (!GNM_IS_WBC_GTK (wbc))
297 return;
298
299 #warning Add UI for this
300 target_sheet = src_sheet;
301 target = *src;
302 range_translate (&target, target_sheet, 1, 0);
303
304 buf = gsf_output_memory_new ();
305 sheet_foreach_cell_in_range (src_sheet, CELL_ITER_ALL, src,
306 (CellIterFunc) &cb_get_content, buf);
307
308 gsf_output_close (buf);
309 data = gsf_output_memory_get_bytes (GSF_OUTPUT_MEMORY (buf));
310 data_len = (size_t)gsf_output_size (buf);
311 if (data_len == 0) {
312 go_cmd_context_error_import (GO_CMD_CONTEXT (cc),
313 _("There is no data "
314 "to convert"));
315 } else {
316 dialogresult = stf_dialog (WBC_GTK (wbc),
317 NULL, FALSE, NULL, FALSE,
318 _("Text to Columns"),
319 data, data_len);
320 }
321 if (dialogresult != NULL) {
322 GnmCellRegion *cr = stf_parse_region (dialogresult->parseoptions,
323 dialogresult->text, NULL, target_sheet->workbook);
324 if (cr != NULL) {
325 stf_dialog_result_attach_formats_to_cr (dialogresult, cr);
326 target.end.col = target.start.col + cr->cols - 1;
327 target.end.row = target.start.row + cr->rows - 1;
328 }
329 if (cr == NULL ||
330 cmd_text_to_columns (wbc, src, src_sheet,
331 &target, target_sheet, cr))
332 go_cmd_context_error_import (GO_CMD_CONTEXT (cc),
333 _("Error while trying to "
334 "parse data into sheet"));
335 stf_dialog_result_free (dialogresult);
336 }
337
338 g_object_unref (buf);
339 }
340
341 static void
clear_stray_NULs(GOIOContext * context,GString * utf8data)342 clear_stray_NULs (GOIOContext *context, GString *utf8data)
343 {
344 char *cpointer, *endpointer;
345 int null_chars = 0;
346 char const *valid_end;
347
348 cpointer = utf8data->str;
349 endpointer = utf8data->str + utf8data->len;
350 while (*cpointer != 0)
351 cpointer++;
352 while (cpointer != endpointer) {
353 null_chars++;
354 *cpointer = ' ';
355 while (*cpointer != 0)
356 cpointer++;
357 }
358 if (null_chars > 0) {
359 gchar const *format;
360 gchar *msg;
361 format = ngettext ("The file contains %d NUL character. "
362 "It has been changed to a space.",
363 "The file contains %d NUL characters. "
364 "They have been changed to spaces.",
365 null_chars);
366 msg = g_strdup_printf (format, null_chars);
367 stf_warning (context, msg);
368 g_free (msg);
369 }
370
371 if (!g_utf8_validate (utf8data->str, utf8data->len, &valid_end)) {
372 g_string_truncate (utf8data, valid_end - utf8data->str);
373 stf_warning (context, _("The file contains invalid UTF-8 encoded characters and has been truncated"));
374 }
375 }
376
377 /*
378 * stf_read_workbook_auto_csvtab:
379 * @fo: file opener
380 * @enc: optional encoding
381 * @context: command context
382 * @book: workbook
383 * @input: file to read from+convert
384 *
385 * Attempt to auto-detect CSV or tab-delimited file
386 */
387 static void
stf_read_workbook_auto_csvtab(G_GNUC_UNUSED GOFileOpener const * fo,gchar const * enc,GOIOContext * context,GoView * view,GsfInput * input)388 stf_read_workbook_auto_csvtab (G_GNUC_UNUSED GOFileOpener const *fo, gchar const *enc,
389 GOIOContext *context,
390 GoView *view, GsfInput *input)
391 {
392 Sheet *sheet;
393 Workbook *book;
394 char *name;
395 char *data;
396 GString *utf8data;
397 size_t data_len;
398 StfParseOptions_t *po;
399 const char *gsfname;
400 int cols, rows, i;
401 GStringChunk *lines_chunk;
402 GPtrArray *lines;
403 WorkbookView *wbv = GNM_WORKBOOK_VIEW (view);
404
405 g_return_if_fail (context != NULL);
406 g_return_if_fail (wbv != NULL);
407
408 book = wb_view_get_workbook (wbv);
409
410 data = stf_preparse (context, input, &data_len);
411 if (!data)
412 return;
413
414 enc = go_guess_encoding (data, data_len, enc, &utf8data, NULL);
415 g_free (data);
416
417 if (!enc) {
418 go_cmd_context_error_import (GO_CMD_CONTEXT (context),
419 _("That file is not in the given encoding."));
420 return;
421 }
422
423 clear_stray_NULs (context, utf8data);
424
425 /*
426 * Try to get the filename we're reading from. This is not a
427 * great way.
428 */
429 gsfname = gsf_input_name (input);
430
431 {
432 const char *ext = gsf_extension_pointer (gsfname);
433 gboolean iscsv = ext && strcasecmp (ext, "csv") == 0;
434 if (iscsv)
435 po = stf_parse_options_guess_csv (utf8data->str);
436 else
437 po = stf_parse_options_guess (utf8data->str);
438 }
439
440 lines_chunk = g_string_chunk_new (100 * 1024);
441 lines = stf_parse_general (po, lines_chunk,
442 utf8data->str, utf8data->str + utf8data->len);
443 rows = lines->len;
444 cols = 0;
445 for (i = 0; i < rows; i++) {
446 GPtrArray *line = g_ptr_array_index (lines, i);
447 cols = MAX (cols, (int)line->len);
448 }
449 gnm_sheet_suggest_size (&cols, &rows);
450 stf_parse_general_free (lines);
451 g_string_chunk_free (lines_chunk);
452
453 name = g_path_get_basename (gsfname);
454 sheet = sheet_new (book, name, cols, rows);
455 g_free (name);
456 workbook_sheet_attach (book, sheet);
457
458 if (stf_parse_sheet (po, utf8data->str, NULL, sheet, 0, 0)) {
459 gboolean is_csv;
460 workbook_recalc_all (book);
461 resize_columns (sheet);
462 if (po->cols_exceeded || po->rows_exceeded) {
463 stf_warning (context,
464 _("Some data did not fit on the "
465 "sheet and was dropped."));
466 }
467 is_csv = po->sep.chr && po->sep.chr[0] == ',';
468 workbook_set_saveinfo
469 (book,
470 GO_FILE_FL_WRITE_ONLY,
471 go_file_saver_for_id
472 (is_csv ? "Gnumeric_stf:stf_csv" : "Gnumeric_stf:stf_assistant"));
473 } else {
474 workbook_sheet_delete (sheet);
475 go_cmd_context_error_import (GO_CMD_CONTEXT (context),
476 _("Parse error while trying to parse data into sheet"));
477 }
478
479
480 stf_parse_options_free (po);
481 g_string_free (utf8data, TRUE);
482 }
483
484 /***********************************************************************************/
485
486 static void
stf_write_csv(GOFileSaver const * fs,GOIOContext * context,GoView const * view,GsfOutput * output)487 stf_write_csv (GOFileSaver const *fs, GOIOContext *context,
488 GoView const *view, GsfOutput *output)
489 {
490 GPtrArray *sheets;
491 WorkbookView *wbv = GNM_WORKBOOK_VIEW (view);
492
493 GnmStfExport *config = g_object_new
494 (GNM_STF_EXPORT_TYPE,
495 "sink", output,
496 "quoting-triggers", ", \t\n\"",
497 NULL);
498
499 sheets = gnm_file_saver_get_sheets (fs, wbv, FALSE);
500 if (sheets) {
501 unsigned ui;
502 for (ui = 0; ui < sheets->len; ui++) {
503 Sheet *sheet = g_ptr_array_index (sheets, ui);
504 gnm_stf_export_options_sheet_list_add (config, sheet);
505 }
506 }
507
508 if (gnm_stf_export (config) == FALSE)
509 go_cmd_context_error_import (GO_CMD_CONTEXT (context),
510 _("Error while trying to write CSV file"));
511
512 g_object_unref (config);
513 }
514
515 static gboolean
csv_tsv_probe(GOFileOpener const * fo,GsfInput * input,GOFileProbeLevel pl)516 csv_tsv_probe (GOFileOpener const *fo, GsfInput *input, GOFileProbeLevel pl)
517 {
518 /* Rough and ready heuristic. If the first N bytes have no
519 * unprintable characters this may be text */
520 const gsf_off_t N = 512;
521
522 if (pl == GO_FILE_PROBE_CONTENT) {
523 guint8 const *header;
524 gsf_off_t i;
525 char const *enc = NULL;
526 GString *header_utf8;
527 char const *p;
528 gboolean ok = TRUE;
529
530 if (gsf_input_seek (input, 0, G_SEEK_SET))
531 return FALSE;
532 i = gsf_input_remaining (input);
533
534 /* If someone ships us an empty file, accept it only if
535 it has a proper name. */
536 if (i == 0)
537 return csv_tsv_probe (fo, input, GO_FILE_PROBE_FILE_NAME);
538
539 if (i > N) i = N;
540 if (NULL == (header = gsf_input_read (input, i, NULL)))
541 return FALSE;
542
543 enc = go_guess_encoding (header, i, NULL, &header_utf8, NULL);
544 if (!enc)
545 return FALSE;
546
547 for (p = header_utf8->str; *p; p = g_utf8_next_char (p)) {
548 gunichar uc = g_utf8_get_char (p);
549 /* isprint might not be true for these: */
550 if (uc == '\n' || uc == '\t' || uc == '\r')
551 continue;
552 /* Also, ignore a byte-order mark which may be used to
553 * indicate UTF-8; see
554 * http://en.wikipedia.org/wiki/Byte_Order_Mark for
555 * background.
556 */
557 if (p == header_utf8->str && uc == 0x0000FEFF) {
558 continue;
559 }
560 if (!g_unichar_isprint (uc)) {
561 ok = FALSE;
562 break;
563 }
564 }
565
566 g_string_free (header_utf8, TRUE);
567 return ok;
568 } else {
569 char const *name = gsf_input_name (input);
570 if (name == NULL)
571 return FALSE;
572 name = gsf_extension_pointer (name);
573 return (name != NULL &&
574 (g_ascii_strcasecmp (name, "csv") == 0 ||
575 g_ascii_strcasecmp (name, "tsv") == 0 ||
576 g_ascii_strcasecmp (name, "txt") == 0));
577 }
578 }
579
580 /**
581 * stf_init: (skip)
582 */
583 void
stf_init(void)584 stf_init (void)
585 {
586 GSList *suffixes = go_slist_create (
587 g_strdup ("csv"),
588 g_strdup ("tsv"),
589 g_strdup ("txt"),
590 NULL);
591 GSList *mimes = go_slist_create (
592 g_strdup ("application/tab-separated-values"),
593 g_strdup ("text/comma-separated-values"),
594 g_strdup ("text/csv"),
595 g_strdup ("text/x-csv"),
596 g_strdup ("text/spreadsheet"),
597 g_strdup ("text/tab-separated-values"),
598 NULL);
599 GSList *mimes_txt = go_slist_create (
600 g_strdup ("text/plain"),
601 g_strdup ("text/csv"),
602 g_strdup ("text/x-csv"),
603 g_strdup ("text/comma-separated-values"),
604 g_strdup ("text/tab-separated-values"),
605 NULL);
606 GOFileSaver *saver;
607 GOFileOpener *opener;
608
609 opener = go_file_opener_new_with_enc (
610 "Gnumeric_stf:stf_csvtab",
611 _("Comma or tab separated values (CSV/TSV)"),
612 suffixes, mimes,
613 csv_tsv_probe, stf_read_workbook_auto_csvtab);
614 go_file_opener_register (opener, 0);
615 g_object_unref (opener);
616
617 opener = go_file_opener_new_with_enc (
618 "Gnumeric_stf:stf_assistant",
619 _("Text import (configurable)"),
620 NULL, mimes_txt,
621 NULL, stf_read_workbook);
622 g_object_set (G_OBJECT (opener), "interactive-only", TRUE, NULL);
623 go_file_opener_register (opener, 0);
624 g_object_unref (opener);
625
626 saver = gnm_stf_file_saver_create ("Gnumeric_stf:stf_assistant");
627 /* Unlike the opener, the saver doesn't require interaction. */
628 go_file_saver_register (saver);
629 g_object_unref (saver);
630
631 saver = go_file_saver_new (
632 "Gnumeric_stf:stf_csv", "csv",
633 _("Comma separated values (CSV)"),
634 GO_FILE_FL_MANUAL_REMEMBER, stf_write_csv);
635 go_file_saver_set_save_scope (saver, GO_FILE_SAVE_SHEET);
636 g_object_set (G_OBJECT (saver), "sheet-selection", TRUE, NULL);
637 go_file_saver_register (saver);
638 g_object_unref (saver);
639 }
640
641 /**
642 * stf_shutdown: (skip)
643 */
644 void
stf_shutdown(void)645 stf_shutdown (void)
646 {
647 go_file_saver_unregister
648 (go_file_saver_for_id ("Gnumeric_stf:stf_assistant"));
649 go_file_saver_unregister
650 (go_file_saver_for_id ("Gnumeric_stf:stf_csv"));
651
652 go_file_opener_unregister
653 (go_file_opener_for_id ("Gnumeric_stf:stf_csvtab"));
654 go_file_opener_unregister
655 (go_file_opener_for_id ("Gnumeric_stf:stf_assistant"));
656 }
657