1# -*- coding: utf-8 -*-
2"""
3CSV File Import Widget
4----------------------
5
6"""
7import sys
8import types
9import os
10import csv
11import enum
12import io
13import traceback
14import warnings
15import logging
16import weakref
17import json
18
19import gzip
20import lzma
21import bz2
22import zipfile
23
24from xml.sax.saxutils import escape
25from functools import singledispatch
26from contextlib import ExitStack
27
28import typing
29from typing import (
30    List, Tuple, Dict, Optional, Any, Callable, Iterable,
31    Union, AnyStr, BinaryIO, Set, Type, Mapping, Sequence, NamedTuple
32)
33
34from AnyQt.QtCore import (
35    Qt, QFileInfo, QTimer, QSettings, QObject, QSize, QMimeDatabase, QMimeType
36)
37from AnyQt.QtGui import (
38    QStandardItem, QStandardItemModel, QPalette, QColor, QIcon
39)
40from AnyQt.QtWidgets import (
41    QLabel, QComboBox, QPushButton, QDialog, QDialogButtonBox, QGridLayout,
42    QVBoxLayout, QSizePolicy, QStyle, QFileIconProvider, QFileDialog,
43    QApplication, QMessageBox, QTextBrowser, QMenu
44)
45from AnyQt.QtCore import pyqtSlot as Slot, pyqtSignal as Signal
46
47import numpy as np
48import pandas.errors
49import pandas as pd
50
51from pandas.api import types as pdtypes
52
53import Orange.data
54from Orange.misc.collections import natural_sorted
55
56from Orange.widgets import widget, gui, settings
57from Orange.widgets.utils.concurrent import PyOwned
58from Orange.widgets.utils import (
59    textimport, concurrent as qconcurrent, unique_everseen, enum_get, qname
60)
61from Orange.widgets.utils.combobox import ItemStyledComboBox
62from Orange.widgets.utils.pathutils import (
63    PathItem, VarPath, AbsPath, samepath, prettyfypath, isprefixed,
64)
65from Orange.widgets.utils.overlay import OverlayWidget
66from Orange.widgets.utils.settings import (
67    QSettings_readArray, QSettings_writeArray
68)
69
# Type variables referenced from `# type:` comments (e.g. `E` in
# `Options.spec_from_encodable`); they are only defined while type checking.
if typing.TYPE_CHECKING:
    # pylint: disable=invalid-name
    T = typing.TypeVar("T")
    K = typing.TypeVar("K")
    E = typing.TypeVar("E", bound=enum.Enum)

# Public names exported by this module.
__all__ = ["OWCSVFileImport"]

# Module level logger.
_log = logging.getLogger(__name__)

# Module level aliases for the `textimport` types used throughout this file.
ColumnType = textimport.ColumnType
RowSpec = textimport.RowSpec
83
def dialect_eq(lhs, rhs):
    # type: (csv.Dialect, csv.Dialect) -> bool
    """
    Return True if two `csv.Dialect` instances describe the same format.

    The delimiter, quote character, double quote flag, escape character,
    quoting mode and skip-initial-space flag are compared, in that order.
    The line terminator is not taken into account.
    """
    compared_fields = (
        "delimiter", "quotechar", "doublequote",
        "escapechar", "quoting", "skipinitialspace",
    )
    return all(
        getattr(lhs, field) == getattr(rhs, field)
        for field in compared_fields
    )
93
94
class Options:
    """
    Stored options for loading CSV-like file.

    Arguments
    ---------
    encoding : str
        A encoding to use for reading.
    dialect : csv.Dialect
        A csv.Dialect instance.
    columntypes: Iterable[Tuple[range, ColumnType]]
        A list of column type ranges specifying the types for columns.
        Need not list all columns. Columns not listed are assumed to have auto
        type inference.
    rowspec : Iterable[Tuple[range, RowSpec]]
         A list of row spec ranges.
    decimal_separator : str
        Decimal separator - a single character string; default: `"."`
    group_separator : str
        Thousands group separator - empty or a single character string;
        default: empty string
    """
    # Re-export the spec types on the class for convenient access.
    RowSpec = RowSpec
    ColumnType = ColumnType

    # NOTE: the default `dialect` instance is created once, when the method
    # is defined, and is shared by all instances that rely on the default.
    def __init__(self, encoding='utf-8', dialect=csv.excel(),
                 columntypes: Iterable[Tuple[range, 'ColumnType']] = (),
                 rowspec=((range(0, 1), RowSpec.Header),),
                 decimal_separator=".", group_separator="") -> None:
        self.encoding = encoding
        self.dialect = dialect
        # Copy the specs into lists so later comparisons (`__eq__`) do not
        # depend on the container type that was passed in.
        self.columntypes = list(columntypes)  # type: List[Tuple[range, ColumnType]]
        self.rowspec = list(rowspec)  # type: List[Tuple[range, RowSpec]]
        self.decimal_separator = decimal_separator
        self.group_separator = group_separator

    def __eq__(self, other):
        """
        Compare this instance to `other` for equality.

        Dialects are compared with :func:`dialect_eq`; all other fields are
        compared with `==`. Returns `NotImplemented` for non `Options`.
        """
        if isinstance(other, Options):
            return (dialect_eq(self.dialect, other.dialect) and
                    self.encoding == other.encoding and
                    self.columntypes == other.columntypes and
                    self.rowspec == other.rowspec and
                    self.group_separator == other.group_separator and
                    self.decimal_separator == other.decimal_separator)
        else:
            return NotImplemented

    def __repr__(self):
        """Return `ClassName(args...)` built from the `__reduce__` state."""
        class_, args = self.__reduce__()
        return "{}{!r}".format(class_.__name__, args)
    __str__ = __repr__

    def __reduce__(self):
        """
        Return the `(callable, args)` pair used by `copy` and `pickle`.

        All constructor arguments are included (in positional order) so a
        copied/unpickled instance compares equal to the original; leaving
        out the separators would silently reset the number format.
        """
        return type(self), (self.encoding, self.dialect,
                            self.columntypes, self.rowspec,
                            self.decimal_separator, self.group_separator)

    def as_dict(self):
        # type: () -> Dict[str, Any]
        """
        Return Option parameters as plain types suitable for
        serialization (e.g JSON serializable).

        Note that the dialect's `escapechar` is not part of the output.
        """
        return {
            "encoding": self.encoding,
            "delimiter": self.dialect.delimiter,
            "quotechar": self.dialect.quotechar,
            "doublequote": self.dialect.doublequote,
            "skipinitialspace": self.dialect.skipinitialspace,
            "quoting": self.dialect.quoting,
            "columntypes": Options.spec_as_encodable(self.columntypes),
            "rowspec": Options.spec_as_encodable(self.rowspec),
            "decimal_separator": self.decimal_separator,
            "group_separator": self.group_separator,
        }

    @staticmethod
    def from_dict(mapping):
        # type: (Dict[str, Any]) -> Options
        """
        Reconstruct a `Options` from a plain dictionary (see :func:`as_dict`).

        The encoding, dialect fields, "columntypes" and "rowspec" keys are
        required (`KeyError` if missing); the separators are optional and
        default to `"."` and `""`.
        """
        encoding = mapping["encoding"]
        delimiter = mapping["delimiter"]
        quotechar = mapping["quotechar"]
        doublequote = mapping["doublequote"]
        quoting = mapping["quoting"]
        skipinitialspace = mapping["skipinitialspace"]

        # The third positional argument is passed as None since `as_dict`
        # does not store an escape character.
        dialect = textimport.Dialect(
            delimiter, quotechar, None, doublequote, skipinitialspace,
            quoting=quoting)

        colspec = mapping["columntypes"]
        rowspec = mapping["rowspec"]
        colspec = Options.spec_from_encodable(colspec, ColumnType)
        rowspec = Options.spec_from_encodable(rowspec, RowSpec)
        decimal = mapping.get("decimal_separator", ".")
        group = mapping.get("group_separator", "")

        return Options(encoding, dialect, colspec, rowspec,
                       decimal_separator=decimal,
                       group_separator=group)

    @staticmethod
    def spec_as_encodable(spec):
        # type: (Iterable[Tuple[range, enum.Enum]]) -> List[Dict[str, Any]]
        """
        Encode `(range, enum member)` pairs as a list of plain dicts with
        "start", "stop" and "value" (the member's name) keys.
        """
        return [{"start": r.start, "stop": r.stop, "value": value.name}
                for r, value in spec]

    @staticmethod
    def spec_from_encodable(spec, enumtype):
        # type: (Iterable[Dict[str, Any]], Type[E]) -> List[Tuple[range, E]]
        """
        Decode the output of :func:`spec_as_encodable` back to a list of
        `(range, enumtype member)` pairs.

        Entries missing a required key are skipped (best effort decoding of
        stored settings).
        """
        r = []
        for v in spec:
            try:
                start, stop, name = v["start"], v["stop"], v["value"]
            except (KeyError, ValueError):
                # Malformed entry; ignore it and keep the rest.
                pass
            else:
                # NOTE(review): `enum_get` is called with a `None` default,
                # so presumably an unknown name yields a `(range, None)`
                # entry - confirm against `enum_get`.
                r.append((range(start, stop), enum_get(enumtype, name, None)))
        return r
219
220
class CSVImportDialog(QDialog):
    """
    A dialog for selecting CSV file import options.

    The dialog wraps a `textimport.CSVImportWidget` and adds Ok, Cancel,
    Reset and Restore Defaults buttons. Errors opening the preview file are
    displayed in an overlay over the data preview.
    """
    def __init__(self, parent=None, flags=Qt.Dialog, **kwargs):
        super().__init__(parent, flags, **kwargs)
        self.setLayout(QVBoxLayout())

        # Options last set with `setOptions` (restored by `reset`).
        self._options = None
        # Path of the file being previewed.
        self._path = None
        # Finalizer for opened file handle (in _update_preview)
        self.__finalizer = None  # type: Optional[Callable[[], None]]
        self._optionswidget = textimport.CSVImportWidget()
        self._optionswidget.previewReadErrorOccurred.connect(
            self.__on_preview_error
        )
        self._optionswidget.previewModelReset.connect(
            self.__on_preview_reset
        )
        self._buttons = buttons = QDialogButtonBox(
            orientation=Qt.Horizontal,
            standardButtons=(QDialogButtonBox.Ok | QDialogButtonBox.Cancel |
                             QDialogButtonBox.Reset |
                             QDialogButtonBox.RestoreDefaults),
            objectName="dialog-button-box",
        )
        # TODO: Help button
        buttons.accepted.connect(self.accept)
        buttons.rejected.connect(self.reject)

        b = buttons.button(QDialogButtonBox.Reset)
        b.clicked.connect(self.reset)
        b = buttons.button(QDialogButtonBox.RestoreDefaults)
        b.clicked.connect(self.restoreDefaults)
        self.layout().addWidget(self._optionswidget)
        self.layout().addWidget(buttons)
        self.setSizePolicy(QSizePolicy.Expanding, QSizePolicy.Expanding)

        # Overlay (initially hidden) with a single label used to display
        # error messages over the data preview.
        self._overlay = OverlayWidget(self)
        self._overlay.setWidget(self._optionswidget.dataview)
        self._overlay.setLayout(QVBoxLayout())
        self._overlay.layout().addWidget(QLabel(wordWrap=True))
        self._overlay.hide()

    def setOptions(self, options):
        # type: (Options) -> None
        """
        Set the import options and update the options widget to match.
        """
        self._options = options
        self._optionswidget.setEncoding(options.encoding)
        self._optionswidget.setDialect(options.dialect)
        self._optionswidget.setNumbersFormat(
            options.group_separator, options.decimal_separator)
        self._optionswidget.setColumnTypeRanges(options.columntypes)
        # Expand the (range, state) pairs into a {row index: state} mapping.
        self._optionswidget.setRowStates(
            {i: v for r, v in options.rowspec for i in r}
        )

    def options(self):
        # type: () -> Options
        """
        Return the options as currently selected in the options widget.
        """
        rowspec_ = self._optionswidget.rowStates()
        # Each row state is stored as a single row range.
        rowspec = [(range(i, i + 1), v) for i, v in rowspec_.items()]
        numformat = self._optionswidget.numbersFormat()
        return Options(
            encoding=self._optionswidget.encoding(),
            dialect=self._optionswidget.dialect(),
            columntypes=self._optionswidget.columnTypeRanges(),
            rowspec=rowspec,
            decimal_separator=numformat["decimal"],
            group_separator=numformat["group"],
        )

    def setPath(self, path):
        """
        Set the preview path.
        """
        if self._path != path:
            self._path = path
            self.__update_preview()

    def path(self):
        """Return the preview path"""
        return self._path

    def reset(self):
        """
        Reset the options to the state previously set with `setOptions`
        effectively undoing any user modifications since then.

        Does nothing if `setOptions` has not been called yet.
        """
        if self._options is not None:
            self.setOptions(self._options)

    def restoreDefaults(self):
        """
        Restore the options to default state.
        """
        # preserve `_options` if set by clients (for `reset`).
        opts = self._options
        self.setOptions(Options("utf-8", csv.excel()))
        self._options = opts

    def __update_preview(self):
        """
        Open the current path and pass the file to the options widget.

        An `OSError` raised while opening is printed to stderr and shown in
        the error overlay. On success the previously opened file (if any)
        is closed and a finalizer closing the new file is registered.
        """
        if not self._path:
            return
        try:
            f = _open(self._path, "rb")
        except OSError as err:
            traceback.print_exc(file=sys.stderr)
            fmt = "".join(traceback.format_exception_only(type(err), err))
            self.__set_error(fmt)
        else:
            self.__clear_error()
            self._optionswidget.setSampleContents(f)
            # Close the file opened for the previous preview.
            closeexisting = self.__finalizer
            if closeexisting is not None:
                self.destroyed.disconnect(closeexisting)
                closeexisting()
            # Close `f` when this dialog is destroyed/garbage collected.
            self.__finalizer = weakref.finalize(self, f.close)
            self.destroyed.connect(self.__finalizer)

    def __set_error(self, text, format=Qt.PlainText):
        """
        Display the error `text` in the overlay and disable the options
        widget and the dialog buttons (see `dialog_button_box_set_enabled`).
        """
        self._optionswidget.setEnabled(False)
        label = self._overlay.findChild(QLabel)  # type: QLabel
        label.setText(text)
        label.setTextFormat(format)
        self._overlay.show()
        self._overlay.raise_()
        dialog_button_box_set_enabled(self._buttons, False)

    def __clear_error(self):
        """
        Hide the error overlay and undo the state changes of `__set_error`.
        """
        if self._overlay.isVisibleTo(self):
            self._overlay.hide()
            self._optionswidget.setEnabled(True)
            # Restore the buttons disabled in `__set_error`; without this
            # they would stay disabled after a subsequent successful load.
            dialog_button_box_set_enabled(self._buttons, True)

    # Enable/disable the accept buttons on the most egregious errors.
    def __on_preview_error(self):
        b = self._buttons.button(QDialogButtonBox.Ok)
        b.setEnabled(False)

    def __on_preview_reset(self):
        b = self._buttons.button(QDialogButtonBox.Ok)
        b.setEnabled(True)
360
361
def dialog_button_box_set_enabled(buttonbox, enabled):
    # type: (QDialogButtonBox, bool) -> None
    """
    Disable or re-enable the buttons of a QDialogButtonBox by role.

    When disabling, every button's current enabled state is remembered in
    a dynamic property (unless one is already stored) and all buttons
    except those with RejectRole or HelpRole are disabled.

    When enabling, each button gets its remembered state back (and the
    stored value is cleared); buttons with no stored state are enabled.
    """
    stash_key = "__p_dialog_button_box_set_enabled"
    always_enabled_roles = (
        QDialogButtonBox.RejectRole, QDialogButtonBox.HelpRole
    )
    for button in buttonbox.buttons():
        role = buttonbox.buttonRole(button)
        if enabled:
            remembered = button.property(stash_key)
            if isinstance(remembered, bool):
                button.setProperty(stash_key, None)
                button.setEnabled(remembered)
            else:
                button.setEnabled(True)
            continue
        # Disabling: remember the state once, so repeated calls do not
        # overwrite it with the already disabled state.
        if button.property(stash_key) is None:
            button.setProperty(stash_key, button.isEnabledTo(buttonbox))
        button.setEnabled(role in always_enabled_roles)
387
388
def icon_for_path(path: str) -> QIcon:
    """
    Return the file icon provider's icon for `path`.

    The generic `QFileIconProvider.File` icon is returned when `path` does
    not exist.
    """
    provider = QFileIconProvider()
    info = QFileInfo(path)
    if not info.exists():
        return provider.icon(QFileIconProvider.File)
    return provider.icon(info)
396
397
class VarPathItem(QStandardItem):
    """
    A `QStandardItem` holding a `PathItem` (absolute or variable prefixed
    path) under `VarPathRole`.

    The display text, icon, tool tip, foreground color and the resolved
    path (`PathRole`) are derived from the stored path item in `data`.
    """
    #: Role under which the resolved path (str) is reported.
    PathRole = Qt.UserRole + 4502
    #: Role under which the `PathItem` is stored.
    VarPathRole = PathRole + 1

    def path(self) -> str:
        """Return the resolved path or '' if unresolved or missing"""
        path = self.data(VarPathItem.PathRole)
        return path if isinstance(path, str) else ""

    def setPath(self, path: str) -> None:
        """Set absolute path."""
        self.setData(PathItem.AbsPath(path), VarPathItem.VarPathRole)

    def varPath(self) -> Optional[PathItem]:
        """Return the stored `PathItem` or None if there is none."""
        vpath = self.data(VarPathItem.VarPathRole)
        return vpath if isinstance(vpath, PathItem) else None

    def setVarPath(self, vpath: PathItem) -> None:
        """Set variable path item."""
        self.setData(vpath, VarPathItem.VarPathRole)

    def resolve(self, vpath: PathItem) -> Optional[str]:
        """
        Resolve `vpath` item. This implementation dispatches to parent model's
        (:func:`VarPathItemModel.resolve`)

        When the item is not in a `VarPathItemModel` the path is resolved
        against an empty replacement environment.
        """
        model = self.model()
        if isinstance(model, VarPathItemModel):
            return model.resolve(vpath)
        else:
            return vpath.resolve({})

    def data(self, role=Qt.UserRole + 1) -> Any:
        """
        Return the item data for `role`.

        Roles without a special case here (and special cased roles that
        cannot be derived from the path item) fall back to the base
        implementation.
        """
        if role == Qt.DisplayRole:
            # An explicitly set text takes precedence over the base name
            # derived from the path item.
            value = super().data(role)
            if value is not None:
                return value
            vpath = self.varPath()
            if isinstance(vpath, PathItem.AbsPath):
                return os.path.basename(vpath.path)
            elif isinstance(vpath, PathItem.VarPath):
                return os.path.basename(vpath.relpath)
            else:
                return None
        elif role == Qt.DecorationRole:
            return icon_for_path(self.path())
        elif role == VarPathItem.PathRole:
            vpath = self.data(VarPathItem.VarPathRole)
            if isinstance(vpath, PathItem.AbsPath):
                return vpath.path
            elif isinstance(vpath, VarPath):
                path = self.resolve(vpath)
                if path is not None:
                    return path
            return super().data(role)
        elif role == Qt.ToolTipRole:
            vpath = self.data(VarPathItem.VarPathRole)
            # Test against `PathItem.AbsPath` like the other branches do
            # (this was spelled `VarPath.AbsPath`).
            if isinstance(vpath, PathItem.AbsPath):
                return vpath.path
            elif isinstance(vpath, VarPath):
                text = f"${{{vpath.name}}}/{vpath.relpath}"
                p = self.resolve(vpath)
                if p is None or not os.path.exists(p):
                    text += " (missing)"
                return text
        elif role == Qt.ForegroundRole:
            # Unresolvable or missing paths are displayed in red.
            vpath = self.data(VarPathItem.VarPathRole)
            if isinstance(vpath, PathItem):
                p = self.resolve(vpath)
                if p is None or not os.path.exists(p):
                    return QColor(Qt.red)
        return super().data(role)
470
471
class ImportItem(VarPathItem):
    """
    A path item that additionally stores the load `Options` for the file
    and a flag marking it as a 'session' item.
    """
    OptionsRole = Qt.UserRole + 14
    IsSessionItemRole = Qt.UserRole + 15

    def options(self) -> Optional[Options]:
        """Return the stored `Options`, or None if none are stored."""
        stored = self.data(ImportItem.OptionsRole)
        if isinstance(stored, Options):
            return stored
        return None

    def setOptions(self, options: Options) -> None:
        """Store the load `options` on the item."""
        self.setData(options, ImportItem.OptionsRole)

    def setIsSessionItem(self, issession: bool) -> None:
        """Set the 'session item' flag."""
        self.setData(issession, ImportItem.IsSessionItemRole)

    def isSessionItem(self) -> bool:
        """Return the 'session item' flag (False when unset)."""
        flag = self.data(ImportItem.IsSessionItemRole)
        return bool(flag)

    @classmethod
    def fromPath(cls, path: Union[str, PathItem]) -> 'ImportItem':
        """
        Construct an item for a local file system path.

        `path` may be a plain string (taken as an absolute path) or a
        `PathItem`. The item text is the path's base name and the tool tip
        the full (for variable paths `${name}/relpath`) path.

        Raises `TypeError` for any other type of `path`.
        """
        if isinstance(path, str):
            path = PathItem.AbsPath(path)

        if isinstance(path, PathItem.VarPath):
            label = os.path.basename(path.relpath)
            tooltip = f"${{{path.name}}}/{path.relpath}"
        elif isinstance(path, PathItem.AbsPath):
            label = os.path.basename(path.path)
            tooltip = path.path
        else:
            raise TypeError

        new_item = cls()
        new_item.setText(label)
        new_item.setToolTip(tooltip)
        new_item.setData(path, ImportItem.VarPathRole)
        return new_item
513
514
class VarPathItemModel(QStandardItemModel):
    """
    An item model holding the replacement environment (variable name to
    path prefix mapping) against which its items' paths are resolved.
    """
    def __init__(self, *args, replacementEnv=types.MappingProxyType({}),
                 **kwargs):
        # A read-only view of a private copy; later changes to the passed
        # mapping do not affect the model.
        self.__replacements = types.MappingProxyType(dict(replacementEnv))
        super().__init__(*args, **kwargs)

    def setReplacementEnv(self, env: Mapping[str, str]) -> None:
        """
        Replace the replacement environment with a copy of `env` and notify
        views that the data of all items may have changed.
        """
        self.__replacements = types.MappingProxyType(dict(env))
        rows, columns = self.rowCount(), self.columnCount()
        # An empty model has no valid indexes to report; emitting
        # `dataChanged` with row/column -1 would pass invalid indexes.
        if rows > 0 and columns > 0:
            self.dataChanged.emit(
                self.index(0, 0),
                self.index(rows - 1, columns - 1)
            )

    def replacementEnv(self) -> Mapping[str, str]:
        """Return the (read-only) replacement environment."""
        return self.__replacements

    def resolve(self, vpath: PathItem) -> Optional[str]:
        """Resolve `vpath` against the model's replacement environment."""
        return vpath.resolve(self.replacementEnv())
533
534
def move_item_to_index(model: QStandardItemModel, item: QStandardItem, index: int):
    """
    Move the row of `item` (which must belong to `model`) to row `index`.

    Nothing is done if the item is already at `index`.
    """
    if item.row() != index:
        assert item.model() is model
        taken_row = model.takeRow(item.row())
        # The row is expected to hold exactly this one item.
        [taken_item] = taken_row
        assert taken_item is item
        model.insertRow(index, [item])
542
543
class FileFormat(NamedTuple):
    """Description of a file format offered in the file dialog."""
    # MIME type name of the format, e.g. "text/csv".
    mime_type: str
    # Human readable format name (shown in the dialog's name filter).
    name: str
    # File name glob patterns for the format, e.g. ("*.csv", "*").
    globs: Sequence[str]
548
549
# File formats offered in the browse dialog (see `FileDialog.setFileFormats`).
# Every entry also lists the catch-all "*" glob.
FileFormats = [
    FileFormat("text/csv", "Text - comma separated", ("*.csv", "*")),
    FileFormat("text/tab-separated-values", "Text - tab separated", ("*.tsv", "*")),
    FileFormat("text/plain", "Text - all files", ("*.txt", "*")),
]
555
556
class FileDialog(QFileDialog):
    """
    A `QFileDialog` whose name filters are defined by `FileFormat` tuples.
    """
    # Formats set with `setFileFormats`.
    __formats: Sequence[FileFormat] = ()

    @staticmethod
    def filterStr(f: FileFormat) -> str:
        """Return the name filter string (`name (glob, ...)`) for `f`."""
        patterns = ', '.join(f.globs)
        return f"{f.name} ({patterns})"

    def setFileFormats(self, formats: Sequence[FileFormat]):
        """Set the file formats and install the matching name filters."""
        name_filters = list(map(FileDialog.filterStr, formats))
        self.__formats = tuple(formats)
        self.setNameFilters(name_filters)

    def fileFormats(self) -> Sequence[FileFormat]:
        """Return the file formats set with `setFileFormats`."""
        return self.__formats

    def selectedFileFormat(self) -> FileFormat:
        """
        Return the file format whose filter string equals the currently
        selected name filter.
        """
        selected = self.selectedNameFilter()

        def is_selected(fmt):
            return FileDialog.filterStr(fmt) == selected

        position = index_where(self.__formats, is_selected)
        return self.__formats[position]
578
579
def default_options_for_mime_type(
        path: str, mime_type: str
) -> Options:
    """
    Return the initial import `Options` for the file at `path`.

    For the known mime types ("text/csv", "text/tab-separated-values") the
    matching `csv` dialect is the starting point and sniffing is restricted
    to that dialect's delimiter. The file is then sniffed with
    `sniff_csv_with_path`, trying the "utf-8", "utf-16" and "iso8859-1"
    encodings in turn; the first attempt that does not fail with an
    `OSError`, `UnicodeError` or `csv.Error` determines the dialect,
    header flag and encoding. If all attempts fail the starting values
    (with "utf-8" encoding) are used.
    """
    known_formats = {
        "text/csv": (csv.excel(), True),
        "text/tab-separated-values": (csv.excel_tab(), True)
    }
    if mime_type in known_formats:
        dialect, header = known_formats[mime_type]
        delimiters = [dialect.delimiter]
    else:
        dialect, header, delimiters = csv.excel(), True, None
    encoding = "utf-8"

    for candidate in ("utf-8", "utf-16", "iso8859-1"):
        try:
            sniffed = sniff_csv_with_path(
                path, encoding=candidate, delimiters=delimiters)
        except (OSError, UnicodeError, csv.Error):
            continue
        dialect, header = sniffed
        encoding = candidate
        break

    # Mark the first row as the header only when one was detected.
    rowspec = [(range(0, 1), RowSpec.Header)] if header else []
    return Options(dialect=dialect, encoding=encoding, rowspec=rowspec)
608
609
class OWCSVFileImport(widget.OWWidget):
    """Widget for importing a data table from a CSV formatted file."""
    # Widget meta description.
    name = "CSV File Import"
    description = "Import a data table from a CSV formatted file."
    icon = "icons/CSVFile.svg"
    priority = 11
    category = "Data"
    keywords = ["file", "load", "read", "open", "csv"]

    class Outputs:
        # The loaded data as an `Orange.data.Table`.
        data = widget.Output(
            name="Data",
            type=Orange.data.Table,
            doc="Loaded data set.")
        # The loaded data as a `pandas.DataFrame`.
        data_frame = widget.Output(
            name="Data Frame",
            type=pd.DataFrame,
            doc="",
            auto_summary=False
        )

    class Error(widget.OWWidget.Error):
        error = widget.Msg(
            "Unexpected error"
        )
        encoding_error = widget.Msg(
            "Encoding error\n"
            "The file might be encoded in an unsupported encoding or it "
            "might be binary"
        )

    #: Paths and options of files accessed in a 'session'
    _session_items = settings.Setting(
        [], schema_only=True)  # type: List[Tuple[str, dict]]

    #: Like `_session_items`, but each path is stored as a `Dict[str, str]`
    #: (see the type comment) instead of a plain string.
    _session_items_v2 = settings.Setting(
        [], schema_only=True)  # type: List[Tuple[Dict[str, str], dict]]
    #: Saved dialog state (last directory and selected filter)
    dialog_state = settings.Setting({
        "directory": "",
        "filter": ""
    })  # type: Dict[str, str]

    # Column type guessing was added to this widget, which breaks
    # compatibility with older saved workflows where types were guessed
    # differently. When compatibility_mode=True the widget keeps the older
    # guessing behaviour.
    settings_version = 3
    compatibility_mode = settings.Setting(False, schema_only=True)

    # NOTE(review): presumably the maximum number of remembered recent
    # items; its use is not visible in this part of the file.
    MaxHistorySize = 50

    want_main_area = False
    buttons_area_orientation = None
    resizing_enabled = False
663
664    def __init__(self, *args, **kwargs):
665        super().__init__(self, *args, **kwargs)
666        self.settingsAboutToBePacked.connect(self._saveState)
667
668        self.__committimer = QTimer(self, singleShot=True)
669        self.__committimer.timeout.connect(self.commit)
670
671        self.__executor = qconcurrent.ThreadExecutor()
672        self.__watcher = None  # type: Optional[qconcurrent.FutureWatcher]
673
674        self.controlArea.layout().setSpacing(-1)  # reset spacing
675        grid = QGridLayout()
676        grid.addWidget(QLabel("File:", self), 0, 0, 1, 1)
677
678        self.import_items_model = VarPathItemModel(self)
679        self.import_items_model.setReplacementEnv(self._replacements())
680        self.recent_combo = ItemStyledComboBox(
681            self, objectName="recent-combo", toolTip="Recent files.",
682            sizeAdjustPolicy=QComboBox.AdjustToMinimumContentsLengthWithIcon,
683            minimumContentsLength=16, placeholderText="Recent files…"
684        )
685        self.recent_combo.setModel(self.import_items_model)
686        self.recent_combo.activated.connect(self.activate_recent)
687        self.recent_combo.setSizePolicy(
688            QSizePolicy.MinimumExpanding, QSizePolicy.Fixed)
689        self.browse_button = QPushButton(
690            "…", icon=self.style().standardIcon(QStyle.SP_DirOpenIcon),
691            toolTip="Browse filesystem", autoDefault=False,
692        )
693        # A button drop down menu with selection of explicit workflow dir
694        # relative import. This is only enabled when 'basedir' workflow env
695        # is set. XXX: Always use menu, disable Import relative... action?
696        self.browse_menu = menu = QMenu(self.browse_button)
697        ac = menu.addAction("Import any file…")
698        ac.triggered.connect(self.browse)
699
700        ac = menu.addAction("Import relative to workflow file…")
701        ac.setToolTip("Import a file within the workflow file directory")
702        ac.triggered.connect(lambda: self.browse_relative("basedir"))
703
704        if "basedir" in self._replacements():
705            self.browse_button.setMenu(menu)
706
707        self.browse_button.setSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed)
708        self.browse_button.clicked.connect(self.browse)
709        grid.addWidget(self.recent_combo, 0, 1, 1, 1)
710        grid.addWidget(self.browse_button, 0, 2, 1, 1)
711        self.controlArea.layout().addLayout(grid)
712
713        ###########
714        # Info text
715        ###########
716        box = gui.widgetBox(self.controlArea, "Info")
717        self.summary_text = QTextBrowser(
718            verticalScrollBarPolicy=Qt.ScrollBarAsNeeded,
719            readOnly=True,
720        )
721        self.summary_text.viewport().setBackgroundRole(QPalette.NoRole)
722        self.summary_text.setFrameStyle(QTextBrowser.NoFrame)
723        self.summary_text.setMinimumHeight(self.fontMetrics().ascent() * 2 + 4)
724        self.summary_text.viewport().setAutoFillBackground(False)
725        box.layout().addWidget(self.summary_text)
726
727        button_box = QDialogButtonBox(
728            orientation=Qt.Horizontal,
729            standardButtons=QDialogButtonBox.Cancel | QDialogButtonBox.Retry
730        )
731        self.load_button = b = button_box.button(QDialogButtonBox.Retry)
732        b.setText("Load")
733        b.clicked.connect(self.__committimer.start)
734        b.setEnabled(False)
735        b.setDefault(True)
736
737        self.cancel_button = b = button_box.button(QDialogButtonBox.Cancel)
738        b.clicked.connect(self.cancel)
739        b.setEnabled(False)
740        b.setAutoDefault(False)
741
742        self.import_options_button = QPushButton(
743            "Import Options…", enabled=False, autoDefault=False,
744            clicked=self._activate_import_dialog
745        )
746
747        def update_buttons(cbindex):
748            self.import_options_button.setEnabled(cbindex != -1)
749            self.load_button.setEnabled(cbindex != -1)
750        self.recent_combo.currentIndexChanged.connect(update_buttons)
751
752        button_box.addButton(
753            self.import_options_button, QDialogButtonBox.ActionRole
754        )
755        button_box.setStyleSheet(
756            "button-layout: {:d};".format(QDialogButtonBox.MacLayout)
757        )
758        self.controlArea.layout().addWidget(button_box)
759        self.setSizePolicy(QSizePolicy.MinimumExpanding, QSizePolicy.Maximum)
760
761        self._restoreState()
762        item = self.current_item()
763        if item is not None:
764            self._invalidate()
765
766    def workflowEnvChanged(self, key, value, oldvalue):
767        super().workflowEnvChanged(key, value, oldvalue)
768        if key == "basedir":
769            self.browse_button.setMenu(self.browse_menu)
770            self.import_items_model.setReplacementEnv(self._replacements())
771
772    @Slot(int)
773    def activate_recent(self, index):
774        """
775        Activate an item from the recent list.
776        """
777        model = self.import_items_model
778        cb = self.recent_combo
779        if 0 <= index < model.rowCount():
780            item = model.item(index)
781            assert isinstance(item, ImportItem)
782            path = item.path()
783            item.setData(True, ImportItem.IsSessionItemRole)
784            move_item_to_index(model, item, 0)
785            if not os.path.exists(path):
786                self._browse_for_missing(
787                    item, onfinished=lambda status: self._invalidate()
788                )
789            else:
790                cb.setCurrentIndex(0)
791                self._invalidate()
792        else:
793            self.recent_combo.setCurrentIndex(-1)
794
795    def _browse_for_missing(
796            self, item: ImportItem, *, onfinished: Optional[Callable[[int], Any]] = None):
797        dlg = self._browse_dialog()
798        model = self.import_items_model
799
800        if onfinished is None:
801            onfinished = lambda status: None
802
803        vpath = item.varPath()
804        prefixpath = None
805        if isinstance(vpath, PathItem.VarPath):
806            prefixpath = self._replacements().get(vpath.name)
807        if prefixpath is not None:
808            dlg.setDirectory(prefixpath)
809        dlg.setAttribute(Qt.WA_DeleteOnClose)
810
811        def accepted():
812            path = dlg.selectedFiles()[0]
813            if isinstance(vpath, VarPath) and not isprefixed(prefixpath, path):
814                mb = self._path_must_be_relative_mb(prefixpath)
815                mb.show()
816                mb.finished.connect(lambda _: onfinished(QDialog.Rejected))
817                return
818
819            # pre-flight check; try to determine the nature of the file
820            mtype = _mime_type_for_path(path)
821            if not mtype.inherits("text/plain"):
822                mb = self._might_be_binary_mb(path)
823                if mb.exec() == QMessageBox.Cancel:
824                    if onfinished:
825                        onfinished(QDialog.Rejected)
826                    return
827
828            if isinstance(vpath, VarPath):
829                vpath_ = VarPath(vpath.name, os.path.relpath(path, prefixpath))
830            else:
831                vpath_ = AbsPath(path)
832            item.setVarPath(vpath_)
833            if item.row() != 0:
834                move_item_to_index(model, item, 0)
835            item.setData(True, ImportItem.IsSessionItemRole)
836            self.set_selected_file(path, item.options())
837            self._note_recent(path, item.options())
838            onfinished(QDialog.Accepted)
839
840        dlg.accepted.connect(accepted)
841        dlg.open()
842
843    def _browse_dialog(self):
844        dlg = FileDialog(
845            self, windowTitle=self.tr("Open Data File"),
846            acceptMode=QFileDialog.AcceptOpen,
847            fileMode=QFileDialog.ExistingFile
848        )
849
850        dlg.setFileFormats(FileFormats)
851        state = self.dialog_state
852        lastdir = state.get("directory", "")
853        lastfilter = state.get("filter", "")
854        if lastdir and os.path.isdir(lastdir):
855            dlg.setDirectory(lastdir)
856        if lastfilter:
857            dlg.selectNameFilter(lastfilter)
858
859        def store_state():
860            state["directory"] = dlg.directory().absolutePath()
861            state["filter"] = dlg.selectedNameFilter()
862        dlg.accepted.connect(store_state)
863        return dlg
864
865    def _might_be_binary_mb(self, path) -> QMessageBox:
866        mb = QMessageBox(
867            parent=self,
868            windowTitle=self.tr(""),
869            icon=QMessageBox.Question,
870            text=self.tr("The '{basename}' may be a binary file.\n"
871                         "Are you sure you want to continue?").format(
872                             basename=os.path.basename(path)),
873            standardButtons=QMessageBox.Cancel | QMessageBox.Yes
874        )
875        mb.setWindowModality(Qt.WindowModal)
876        return mb
877
878    def _path_must_be_relative_mb(self, prefix: str) -> QMessageBox:
879        mb = QMessageBox(
880            parent=self, windowTitle=self.tr("Invalid path"),
881            icon=QMessageBox.Warning,
882            text=self.tr("Selected path is not within '{prefix}'").format(
883                prefix=prefix
884            ),
885        )
886        mb.setAttribute(Qt.WA_DeleteOnClose)
887        return mb
888
889    @Slot(str)
890    def browse_relative(self, prefixname):
891        path = self._replacements().get(prefixname)
892        self.browse(prefixname=prefixname, directory=path)
893
894    @Slot()
895    def browse(self, prefixname=None, directory=None):
896        """
897        Open a file dialog and select a user specified file.
898        """
899        dlg = self._browse_dialog()
900        if directory is not None:
901            dlg.setDirectory(directory)
902
903        status = dlg.exec()
904        dlg.deleteLater()
905        if status == QFileDialog.Accepted:
906            selected_filter = dlg.selectedFileFormat()
907            path = dlg.selectedFiles()[0]
908            if prefixname:
909                _prefixpath = self._replacements().get(prefixname, "")
910                if not isprefixed(_prefixpath, path):
911                    mb = self._path_must_be_relative_mb(_prefixpath)
912                    mb.show()
913                    return
914                varpath = VarPath(prefixname, os.path.relpath(path, _prefixpath))
915            else:
916                varpath = PathItem.AbsPath(path)
917
918            # pre-flight check; try to determine the nature of the file
919            mtype = _mime_type_for_path(path)
920            if not mtype.inherits("text/plain"):
921                mb = self._might_be_binary_mb(path)
922                if mb.exec() == QMessageBox.Cancel:
923                    return
924            # initialize options based on selected format
925            options = default_options_for_mime_type(
926                path, selected_filter.mime_type,
927            )
928            # Search for path in history.
929            # If found use the stored params to initialize the import dialog
930            items = self.itemsFromSettings()
931            idx = index_where(items, lambda t: samepath(t[0], path))
932            if idx is not None:
933                _, options_ = items[idx]
934                if options_ is not None:
935                    options = options_
936            dlg = CSVImportDialog(
937                self, windowTitle="Import Options", sizeGripEnabled=True)
938            dlg.setWindowModality(Qt.WindowModal)
939            dlg.setPath(path)
940            dlg.setOptions(options)
941            status = dlg.exec()
942            dlg.deleteLater()
943            if status == QDialog.Accepted:
944                self.set_selected_file(path, dlg.options())
945                self.current_item().setVarPath(varpath)
946
947    def current_item(self):
948        # type: () -> Optional[ImportItem]
949        """
950        Return the current selected item (file) or None if there is no
951        current item.
952        """
953        idx = self.recent_combo.currentIndex()
954        if idx == -1:
955            return None
956
957        item = self.recent_combo.model().item(idx)  # type: QStandardItem
958        if isinstance(item, ImportItem):
959            return item
960        else:
961            return None
962
963    def _activate_import_dialog(self):
964        """Activate the Import Options dialog for the  current item."""
965        item = self.current_item()
966        assert item is not None
967        dlg = CSVImportDialog(
968            self, windowTitle="Import Options", sizeGripEnabled=True,
969        )
970        dlg.setWindowModality(Qt.WindowModal)
971        dlg.setAttribute(Qt.WA_DeleteOnClose)
972        settings = self._local_settings()
973        settings.beginGroup(qname(type(dlg)))
974        size = settings.value("size", QSize(), type=QSize)  # type: QSize
975        if size.isValid():
976            dlg.resize(size)
977
978        path = item.data(ImportItem.PathRole)
979        options = item.data(ImportItem.OptionsRole)
980        dlg.setPath(path)  # Set path before options so column types can
981        if isinstance(options, Options):
982            dlg.setOptions(options)
983
984        def update():
985            newoptions = dlg.options()
986            item.setData(newoptions, ImportItem.OptionsRole)
987            # update local recent paths list
988            self._note_recent(path, newoptions)
989            if newoptions != options:
990                self._invalidate()
991        dlg.accepted.connect(update)
992
993        def store_size():
994            settings.setValue("size", dlg.size())
995        dlg.finished.connect(store_size)
996        dlg.show()
997
    def set_selected_file(self, filename, options=None):
        """
        Set the current selected filename path.

        The file is registered through `_add_recent` (which places it at the
        top of the recent items and selects it) and the current output is
        then invalidated so a new load is scheduled.

        Parameters
        ----------
        filename : str
            Path of the file to select.
        options : Optional[Options]
            Import options to associate with the file; passed on to
            `_add_recent` unchanged.
        """
        self._add_recent(filename, options)
        self._invalidate()
1004
    #: Saved options for a filename. This is the schema of a single entry of
    #: the "recent" array in the local settings, as passed to
    #: `QSettings_readArray` by `_note_recent` and `itemsFromSettings`.
    SCHEMA = {
        "path": str,  # Local filesystem path
        "options": str,  # json encoded 'Options'
    }
1010
1011    @classmethod
1012    def _local_settings(cls):
1013        # type: () -> QSettings
1014        """Return a QSettings instance with local persistent settings."""
1015        filename = "{}.ini".format(qname(cls))
1016        fname = os.path.join(settings.widget_settings_dir(), filename)
1017        return QSettings(fname, QSettings.IniFormat)
1018
1019    def _add_recent(self, filename, options=None):
1020        # type: (str, Optional[Options]) -> None
1021        """
1022        Add filename to the list of recent files.
1023        """
1024        model = self.import_items_model
1025        index = index_where(
1026            (model.index(i, 0).data(ImportItem.PathRole)
1027             for i in range(model.rowCount())),
1028            lambda path: isinstance(path, str) and samepath(path, filename)
1029        )
1030        if index is not None:
1031            item, *_ = model.takeRow(index)
1032        else:
1033            item = ImportItem.fromPath(filename)
1034
1035        # item.setData(VarPath(filename), ImportItem.VarPathRole)
1036        item.setData(True, ImportItem.IsSessionItemRole)
1037        model.insertRow(0, item)
1038
1039        if options is not None:
1040            item.setOptions(options)
1041
1042        self.recent_combo.setCurrentIndex(0)
1043
1044        if not os.path.exists(filename):
1045            return
1046        self._note_recent(filename, options)
1047
1048    def _note_recent(self, filename, options):
1049        # store item to local persistent settings
1050        s = self._local_settings()
1051        arr = QSettings_readArray(s, "recent", OWCSVFileImport.SCHEMA)
1052        item = {"path": filename}
1053        if options is not None:
1054            item["options"] = json.dumps(options.as_dict())
1055        arr = [item for item in arr if not samepath(item.get("path"), filename)]
1056        arr.append(item)
1057        QSettings_writeArray(s, "recent", arr)
1058
    def _invalidate(self):
        """
        Invalidate the current output and schedule a new commit call.

        The commit timer is (re)started, a task that is still being watched
        is cancelled, and the widget is put into the blocking state.
        """
        # Invalidate the current output and schedule a new commit call.
        # (NOTE: The widget enters a blocking state)
        self.__committimer.start()
        if self.__watcher is not None:
            self.__cancel_task()
        self.setBlocking(True)
1066
1067    def commit(self):
1068        """
1069        Commit the current state and submit the load task for execution.
1070
1071        Note
1072        ----
1073        Any existing pending task is canceled.
1074        """
1075        self.__committimer.stop()
1076        if self.__watcher is not None:
1077            self.__cancel_task()
1078        self.error()
1079
1080        item = self.current_item()
1081        if item is None:
1082            return
1083        path = item.path()
1084        opts = item.options()
1085        if not isinstance(opts, Options):
1086            return
1087
1088        task = state = TaskState()
1089        state.future = ...
1090        state.watcher = qconcurrent.FutureWatcher()
1091        state.progressChanged.connect(
1092            self.__set_read_progress, Qt.DirectConnection)
1093
1094        def progress_(i, j):
1095            task.emitProgressChangedOrCancel(i, j)
1096
1097        task.future = self.__executor.submit(
1098            clear_stack_on_cancel(load_csv),
1099            path, opts, progress_, self.compatibility_mode
1100        )
1101        task.watcher.setFuture(task.future)
1102        w = task.watcher
1103        w.done.connect(self.__handle_result)
1104        w.progress = state
1105        self.__watcher = w
1106        self.__set_running_state()
1107
1108    @Slot('qint64', 'qint64')
1109    def __set_read_progress(self, read, count):
1110        if count > 0:
1111            self.progressBarSet(100 * read / count)
1112
1113    def __cancel_task(self):
1114        # Cancel and dispose of the current task
1115        assert self.__watcher is not None
1116        w = self.__watcher
1117        w.future().cancel()
1118        w.progress.cancel = True
1119        w.done.disconnect(self.__handle_result)
1120        w.progress.progressChanged.disconnect(self.__set_read_progress)
1121        self.__watcher = None
1122
1123    def cancel(self):
1124        """
1125        Cancel current pending or executing task.
1126        """
1127        if self.__watcher is not None:
1128            self.__cancel_task()
1129            self.__clear_running_state()
1130            self.setStatusMessage("Cancelled")
1131            self.summary_text.setText(
1132                "<div>Cancelled<br/><small>Press 'Reload' to try again</small></div>"
1133            )
1134
    def __set_running_state(self):
        """
        Put the widget's UI into the 'running' state.

        Initializes the progress bar, blocks the widget, enables the cancel
        button, relabels the load button to 'Restart', clears all error
        messages and shows the path being loaded in the summary text.
        """
        self.progressBarInit()
        self.setBlocking(True)
        self.setStatusMessage("Running")
        self.cancel_button.setEnabled(True)
        self.load_button.setText("Restart")
        path = self.current_item().path()
        self.Error.clear()
        self.summary_text.setText(
            "<div>Loading: <i>{}</i><br/>".format(prettyfypath(path))
        )
1146
    def __clear_running_state(self, ):
        """
        Leave the 'running' state.

        Finishes the progress bar, clears the status message, unblocks the
        widget, disables the cancel button and relabels the load button to
        'Reload'.
        """
        self.progressBarFinished()
        self.setStatusMessage("")
        self.setBlocking(False)
        self.cancel_button.setEnabled(False)
        self.load_button.setText("Reload")
1153
1154    def __set_error_state(self, err):
1155        self.Error.clear()
1156        if isinstance(err, UnicodeDecodeError):
1157            self.Error.encoding_error(exc_info=err)
1158        else:
1159            self.Error.error(exc_info=err)
1160
1161        path = self.current_item().path()
1162        basename = os.path.basename(path)
1163        if isinstance(err, UnicodeDecodeError):
1164            text = (
1165                "<div><i>{basename}</i> was not loaded due to a text encoding "
1166                "error. The file might be saved in an unknown or invalid "
1167                "encoding, or it might be a binary file.</div>"
1168            ).format(
1169                basename=escape(basename)
1170            )
1171        else:
1172            text = (
1173                "<div><i>{basename}</i> was not loaded due to an error:"
1174                "<p style='white-space: pre;'>{err}</p>"
1175            ).format(
1176                basename=escape(basename),
1177                err="".join(traceback.format_exception_only(type(err), err))
1178            )
1179        self.summary_text.setText(text)
1180
    def __clear_error_state(self):
        """Clear the `Error.error` message and empty the summary text."""
        # NOTE(review): only `Error.error` is cleared here, not
        # `Error.encoding_error`; `__set_running_state` clears all errors
        # when a load starts - confirm that is what makes this sufficient.
        self.Error.error.clear()
        self.summary_text.setText("")
1184
    def onDeleteWidget(self):
        """
        Reimplemented.

        Cancels a task that is still being watched and shuts down the
        executor before delegating to the base class.
        """
        # NOTE(review): the executor is shut down only when a task is
        # pending at deletion time; confirm that leaving it running
        # otherwise is intended.
        if self.__watcher is not None:
            self.__cancel_task()
            self.__executor.shutdown()
        super().onDeleteWidget()
1191
1192    @Slot(object)
1193    def __handle_result(self, f):
1194        # type: (qconcurrent.Future[pd.DataFrame]) -> None
1195        assert f.done()
1196        assert f is self.__watcher.future()
1197        self.__watcher = None
1198        self.__clear_running_state()
1199
1200        try:
1201            df = f.result()
1202            assert isinstance(df, pd.DataFrame)
1203        except pandas.errors.EmptyDataError:
1204            df = pd.DataFrame({})
1205        except Exception as e:  # pylint: disable=broad-except
1206            self.__set_error_state(e)
1207            df = None
1208        else:
1209            self.__clear_error_state()
1210
1211        if df is not None:
1212            table = pandas_to_table(df)
1213            filename = self.current_item().path()
1214            table.name = os.path.splitext(os.path.split(filename)[-1])[0]
1215        else:
1216            table = None
1217        self.Outputs.data_frame.send(df)
1218        self.Outputs.data.send(table)
1219        self._update_status_messages(table)
1220
1221    def _update_status_messages(self, data):
1222        if data is None:
1223            return
1224
1225        def pluralize(seq):
1226            return "s" if len(seq) != 1 else ""
1227
1228        summary = ("{n_instances} row{plural_1}, "
1229                   "{n_features} feature{plural_2}, "
1230                   "{n_meta} meta{plural_3}").format(
1231                       n_instances=len(data), plural_1=pluralize(data),
1232                       n_features=len(data.domain.attributes),
1233                       plural_2=pluralize(data.domain.attributes),
1234                       n_meta=len(data.domain.metas),
1235                       plural_3=pluralize(data.domain.metas))
1236        self.summary_text.setText(summary)
1237
1238    def itemsFromSettings(self):
1239        # type: () -> List[Tuple[str, Options]]
1240        """
1241        Return items from local history.
1242        """
1243        s = self._local_settings()
1244        items_ = QSettings_readArray(s, "recent", OWCSVFileImport.SCHEMA)
1245        items = []  # type: List[Tuple[str, Options]]
1246        for item in items_:
1247            path = item.get("path", "")
1248            if not path:
1249                continue
1250            opts_json = item.get("options", "")
1251            try:
1252                opts = Options.from_dict(json.loads(opts_json))
1253            except (csv.Error, LookupError, TypeError, json.JSONDecodeError):
1254                _log.error("Could not reconstruct options for '%s'", path,
1255                           exc_info=True)
1256            else:
1257                items.append((path, opts))
1258        return items[::-1]
1259
1260    def _replacements(self) -> Mapping[str, str]:
1261        replacements = []
1262        basedir = self.workflowEnv().get("basedir", None)
1263        if basedir is not None:
1264            replacements += [('basedir', basedir)]
1265        return dict(replacements)
1266
1267    def _saveState(self):
1268        session_items = []
1269        model = self.import_items_model
1270        for item in map(model.item, range(model.rowCount())):
1271            if isinstance(item, ImportItem) and item.data(ImportItem.IsSessionItemRole):
1272                vp = item.data(VarPathItem.VarPathRole)
1273                session_items.append((vp.as_dict(), item.options().as_dict()))
1274        self._session_items_v2 = session_items
1275
1276    def _restoreState(self):
1277        # Restore the state. Merge session (workflow) items with the
1278        # local history.
1279        model = self.import_items_model
1280        model.setReplacementEnv(self._replacements())
1281
1282        # local history
1283        items = self.itemsFromSettings()
1284        # stored session items
1285        sitems = []
1286        # replacements = self._replacements()
1287        for p, m in self._session_items_v2:
1288            try:
1289                p, m = (PathItem.from_dict(p), Options.from_dict(m))
1290            except (csv.Error, LookupError, ValueError):
1291                _log.error("Failed to restore '%s'", p, exc_info=True)
1292            else:
1293                sitems.append((p, m, True))
1294
1295        items = sitems + [(PathItem.AbsPath(p), m, False) for p, m in items]
1296        items = unique_everseen(items, key=lambda t: t[0])
1297        curr = self.recent_combo.currentIndex()
1298        if curr != -1:
1299            currentpath = self.recent_combo.currentData(ImportItem.PathRole)
1300        else:
1301            currentpath = None
1302
1303        for path, options, is_session in items:
1304            item = ImportItem.fromPath(path)
1305            item.setOptions(options)
1306            item.setData(is_session, ImportItem.IsSessionItemRole)
1307            model.appendRow(item)
1308
1309        if currentpath:
1310            idx = self.recent_combo.findData(currentpath, ImportItem.PathRole)
1311        elif model.data(model.index(0, 0), ImportItem.IsSessionItemRole):
1312            # restore last (current) session item
1313            idx = 0
1314        else:
1315            idx = -1
1316        self.recent_combo.setCurrentIndex(idx)
1317
1318    @classmethod
1319    def migrate_settings(cls, settings, version):
1320        if not version or version < 2:
1321            settings["compatibility_mode"] = True
1322
1323        if version is not None and version < 3:
1324            items_ = settings.pop("_session_items", [])
1325            items_v2 = [(PathItem.AbsPath(p).as_dict(), m) for p, m in items_]
1326            settings["_session_items_v2"] = items_v2
1327
1328
@singledispatch
def sniff_csv(file, samplesize=2 ** 20, delimiters=None):
    """
    Guess the csv dialect and header presence from an open text stream.

    Parameters
    ----------
    file
        A text stream; up to `samplesize` characters are read from it.
    samplesize : int
        Size of the sample to read.
    delimiters : Optional[str]
        Candidate delimiters passed on to `csv.Sniffer.sniff`.

    Returns
    -------
    (dialect, has_header) : Tuple[textimport.Dialect, bool]
    """
    sample = file.read(samplesize)
    guessed = csv.Sniffer().sniff(sample, delimiters=delimiters)
    dialect = textimport.Dialect(
        guessed.delimiter, guessed.quotechar,
        guessed.escapechar, guessed.doublequote,
        guessed.skipinitialspace, guessed.quoting
    )
    return dialect, HeaderSniffer(dialect).has_header(sample)
1341
1342
class HeaderSniffer(csv.Sniffer):
    """
    A `csv.Sniffer` whose `has_header` works with a predefined dialect.
    """

    def __init__(self, dialect: csv.Dialect):
        super().__init__()
        #: The fixed dialect reported by `sniff`.
        self.dialect = dialect

    def sniff(self, *_args, **_kwargs):  # pylint: disable=signature-differs
        """
        Return the predefined dialect, ignoring all arguments.

        `has_header` sniffs the dialect itself, so it cannot otherwise
        detect headers for a predefined dialect.
        """
        return self.dialect
1352
1353
1354@sniff_csv.register(str)
1355@sniff_csv.register(bytes)
def sniff_csv_with_path(path, encoding="utf-8", samplesize=2 ** 20, delimiters=None):
    """
    Sniff the csv dialect and header presence of the file at `path`.

    The file is opened in text mode with `encoding` (through `_open`) and
    passed to `sniff_csv`; its result is returned.
    """
    with _open(path, "rt", encoding=encoding) as stream:
        result = sniff_csv(stream, samplesize, delimiters)
    return result
1359
1360
def _open(path, mode, encoding=None):
    # type: (Union[str, bytes], str, Optional[str]) -> typing.IO[Any]
    """
    Open a local file `path` for reading. The file may be gzip, bz2, xz or
    zip compressed; the compression is chosen from the file name extension
    (case insensitive).

    If a zip archive then a single archive member is expected.

    Parameters
    ----------
    path : Union[str, bytes]
        File system path
    mode : str
        'r', 'rb' or 'rt'. Note that plain 'r' is passed on unchanged: the
        compressed openers return a binary stream for it, while the builtin
        `open` used for all other files returns a text stream.
    encoding : Optional[str]
        Optional text encoding, for opening in text mode.

    Returns
    -------
    stream: io.BaseIO
        A stream opened for reading.

    Raises
    ------
    ValueError
        If `mode` is not one of the supported modes, or if a zip archive
        does not contain exactly one member.
    """
    if mode not in {'r', 'rb', 'rt'}:
        raise ValueError(
            "Invalid mode {!r}; expected one of 'r', 'rb' or 'rt'".format(mode)
        )
    _, ext = os.path.splitext(path)
    # For a `bytes` path the extension is `bytes` too and would never
    # compare equal to the `str` suffixes below; decode it first.
    ext = os.fsdecode(ext).lower()
    if ext == ".gz":
        return gzip.open(path, mode, encoding=encoding)
    elif ext == ".bz2":
        return bz2.open(path, mode, encoding=encoding)
    elif ext == ".xz":
        return lzma.open(path, mode, encoding=encoding)
    elif ext == ".zip":
        # ZipFile treats a `bytes` argument as a file object, not a path.
        arh = zipfile.ZipFile(os.fsdecode(path), 'r')
        try:
            filelist = arh.infolist()
            if len(filelist) != 1:
                raise ValueError("Expected a single file in the archive.")
            f = arh.open(filelist[0], 'r')
        except BaseException:
            # Do not leak the archive handle when no stream is returned.
            arh.close()
            raise
        # patch the f.close to also close the main archive file
        f_close = f.close

        def close_():
            f_close()
            arh.close()
        f.close = close_
        if 't' in mode:
            f = io.TextIOWrapper(f, encoding=encoding)
        return f
    else:
        return open(path, mode, encoding=encoding)
1412
1413
#: Mime types of the compressed containers whose contents
#: `_mime_type_for_path` peeks into to determine the payload's mime type.
compression_types = [
    "application/gzip", "application/zip",
    "application/x-xz", "application/x-bzip",
    # application/x-lz4
]
1419
1420
def _mime_type_for_path(path):
    # type: (str) -> QMimeType
    """
    Return the mime type of the file on a local filesystem.

    If the file is one of the `compression_types`, the mime type is instead
    determined from the first 4096 bytes of its decompressed contents. If
    the contents cannot be read, an empty sample is used.

    Parameters
    ----------
    path : str
        Local filesystem path

    Returns
    -------
    mimetype: QMimeType
    """
    db = QMimeDatabase()
    mtype = db.mimeTypeForFile(path, QMimeDatabase.MatchDefault)
    if not any(mtype.inherits(name) for name in compression_types):
        return mtype

    # peek contents
    try:
        with _open(path, "rb") as stream:
            sample = stream.read(4096)
    except Exception:  # pylint: disable=broad-except
        sample = b''
    return db.mimeTypeForData(sample)
1448
1449
#: Default strings treated as missing values.
NA_DEFAULT = ["", "?", ".", "~", "nan", "NAN", "NaN", "N/A", "n/a", "NA"]

#: Strings treated as missing values, by column type. `load_csv` looks a
#: column's type up here and falls back to `NA_DEFAULT` for types that are
#: not listed. Text columns have no missing value markers.
NA_VALUES = {
    ColumnType.Numeric: NA_DEFAULT,
    ColumnType.Categorical: NA_DEFAULT,
    ColumnType.Time: NA_DEFAULT + ["NaT", "NAT"],
    ColumnType.Text: [],
    ColumnType.Auto: NA_DEFAULT,
}
1459
1460
def load_csv(path, opts, progress_callback=None, compatibility_mode=False):
    # type: (Union[AnyStr, BinaryIO], Options, Optional[Callable[[int, int], None]], bool) -> pd.DataFrame
    """
    Load a delimited text file into a pandas data frame.

    Parameters
    ----------
    path : Union[AnyStr, BinaryIO]
        A filesystem path (opened with `_open`), or an object with a `read`
        method that is used as the binary input stream.
    opts : Options
        Import options. The dialect, encoding, column types, row
        specification and decimal/group separators are used.
    progress_callback : Optional[Callable[[int, int], None]]
        Passed on to the `TextReadWrapper` wrapping the input stream.
    compatibility_mode : bool
        If True, column types are not guessed after loading (behavior of
        older workflows).

    Returns
    -------
    df : pd.DataFrame
        The loaded data with the columns marked as skipped removed. When no
        row is marked as a header the columns are named 'X.0', 'X.1', ...

    Raises
    ------
    TypeError
        If `path` is neither a path nor a readable object, or if
        `opts.columntypes` contains an unknown column type.
    """
    def dtype(coltype):
        # type: (ColumnType) -> Optional[str]
        if coltype == ColumnType.Numeric:
            return "float"
        elif coltype == ColumnType.Categorical:
            return "category"
        elif coltype in (ColumnType.Time, ColumnType.Text):
            return "object"
        elif coltype in (ColumnType.Skip, ColumnType.Auto):
            return None
        else:
            raise TypeError("Unknown column type: {!r}".format(coltype))

    # (column index, column type) pairs expanded from the (range, type) spec
    coltypes = [(i, c) for r, c in opts.columntypes for i in r]

    dtypes = {i: dtype(c) for i, c in coltypes}
    dtypes = {i: dtp for i, dtp in dtypes.items() if dtp is not None}

    columns_ignored = {i for i, c in coltypes if c == ColumnType.Skip}
    parse_dates = sorted({i for i, c in coltypes if c == ColumnType.Time})
    na_values = {i: NA_VALUES.get(c, NA_DEFAULT) for i, c in coltypes}
    if not parse_dates:
        parse_dates = False

    # fixup header indices to account for skipped rows (header row indices
    # pick rows after skiprows)
    hspec = sorted(opts.rowspec, key=lambda t: t[0].start)
    headers = []
    nskipped = 0
    for range_, state in hspec:
        if state == RowSpec.Skipped:
            nskipped += len(range_)
        elif state == RowSpec.Header:
            headers.extend(
                range(range_.start - nskipped, range_.stop - nskipped)
            )
    skiprows = [row for r, st in hspec if st == RowSpec.Skipped for row in r]

    if not headers:
        header = None
    elif len(headers) == 1:
        header = headers[0]
    else:
        header = headers
    # Name prefix for the columns of a file without a header row.
    prefix = "X." if not headers else None

    if not skiprows:
        skiprows = None

    numbers_format_kwds = {}

    if opts.decimal_separator != ".":
        numbers_format_kwds["decimal"] = opts.decimal_separator

    if opts.group_separator != "":
        numbers_format_kwds["thousands"] = opts.group_separator

    if numbers_format_kwds:
        # float_precision = "round_trip" cannot handle non c-locale decimal and
        # thousands sep (https://github.com/pandas-dev/pandas/issues/35365).
        # Fallback to 'high'.
        numbers_format_kwds["float_precision"] = "high"
    else:
        numbers_format_kwds["float_precision"] = "round_trip"

    with ExitStack() as stack:
        if isinstance(path, (str, bytes)):
            f = stack.enter_context(_open(path, 'rb'))
        elif isinstance(path, (io.RawIOBase, io.BufferedIOBase)) or \
                hasattr(path, "read"):
            f = path
        else:
            raise TypeError(
                "Expected a path or a readable object, got {!r}".format(
                    type(path).__name__)
            )
        file = TextReadWrapper(
            f, encoding=opts.encoding,
            progress_callback=progress_callback)
        stack.callback(file.detach)
        # NOTE: the `prefix` argument of `read_csv` was deprecated in
        # pandas 1.4 and removed in 2.0; the columns are renamed below.
        df = pd.read_csv(
            file, sep=opts.dialect.delimiter, dialect=opts.dialect,
            skipinitialspace=opts.dialect.skipinitialspace,
            header=header, skiprows=skiprows,
            dtype=dtypes, parse_dates=parse_dates,
            na_values=na_values, keep_default_na=False,
            **numbers_format_kwds
        )
        if prefix is not None:
            # Without a header pandas numbers the columns 0, 1, ...
            df.columns = [prefix + str(name) for name in df.columns]

        # for older workflows avoid guessing type guessing
        if not compatibility_mode:
            df = guess_types(df, dtypes, columns_ignored)

        if columns_ignored:
            # TODO: use 'usecols' parameter in `read_csv` call to
            # avoid loading/parsing the columns
            df.drop(
                columns=[df.columns[i] for i in columns_ignored
                         if i < len(df.columns)],
                inplace=True
            )
        return df
1578
1579
def guess_types(
        df: pd.DataFrame, dtypes: Dict[int, str], columns_ignored: Set[int]
) -> pd.DataFrame:
    """
    Guess the data type of variables according to their values.

    Parameters
    ----------
    df
        Data frame; it is modified in place.
    dtypes
        Data types set by the user, by column position. Types are guessed
        only for columns that have no data type defined here.
    columns_ignored
        Positions of the ignored columns. Ignored columns are skipped.

    Returns
    -------
    The same data frame, with the dtypes of the guessed columns changed.
    """
    for position, name in enumerate(df):
        if position in columns_ignored:
            continue
        # only when automatic is set in widget dialog
        if dtypes.get(position, None) is not None:
            continue
        df[name] = guess_data_type(df[name])
    return df
1605
1606
def guess_data_type(col: pd.Series) -> pd.Series:
    """
    Guess the type of a column and return it with the matching dtype.

    The logic is the same as in guess_data_type from the io_utils module.
    Only the dtype of the column is changed, so that the correct
    Orange.data.variable is used later.

    Logic:
    - numeric column:
        - at most two distinct values, all in {0, 1} or all in {1, 2}
          -> categorical (DiscreteVariable)
        - else -> unchanged (ContinuousVariable)
    - any other column:
        - all values can be converted to date-time -> TimeVariable
        - fewer than 100 distinct values and fewer than len(col) ** 0.7
          -> categorical (DiscreteVariable)
        - else -> unchanged (StringVariable)

    Parameters
    ----------
    col
        Data column

    Returns
    -------
    Data column with correct dtype
    """
    def as_datetimes(series):
        """
        Convert `series` to date-times, or return None if any value fails.

        The same dates are often repeated in large data, so only the
        distinct values are parsed and then used as a lookup table.
        """
        try:
            lookup = {
                value: pd.to_datetime(value) for value in series.unique()
            }
        except ValueError:
            return None
        return series.map(lookup)

    if pdtypes.is_numeric_dtype(col):
        distinct = col.unique()
        looks_binary = len(distinct) <= 2 and (
            len(np.setdiff1d(distinct, [0, 1])) == 0
            or len(np.setdiff1d(distinct, [1, 2])) == 0
        )
        return col.astype("category") if looks_binary else col

    # not numeric: try to parse as dates first
    parsed = as_datetimes(col)
    if parsed is not None:
        return parsed
    distinct = col.unique()
    if len(distinct) < 100 and len(distinct) < len(col) ** 0.7:
        return col.astype("category")
    return col
1658
1659
def clear_stack_on_cancel(f):
    """
    A decorator that catches the TaskState.UserCancelException exception
    and clears the exception's traceback to remove local references.

    For any other exception the frames referenced by its traceback are
    cleared before it is re-raised. The wrapper keeps the name, docstring
    and other metadata of `f`.

    Parameters
    ----------
    f : callable

    Returns
    -------
    wrapped : callable
    """
    # Local import: only `singledispatch` is imported from functools at
    # the top of the module.
    from functools import wraps  # pylint: disable=import-outside-toplevel

    @wraps(f)
    def wrapper(*args, **kwargs):
        try:
            return f(*args, **kwargs)
        except TaskState.UserCancelException as e:
            # TODO: Is this enough to allow immediate gc of the stack?
            # How does it chain across cython code?
            # Maybe just return None.
            e = e.with_traceback(None)
            e.__context__ = None
            e.__cause__ = None
            raise e
        except BaseException as e:
            traceback.clear_frames(e.__traceback__)
            raise

    return wrapper
1689
1690
class TaskState(QObject, PyOwned):
    """
    State object through which a task reports its progress and through
    which cancellation of the task can be requested.
    """

    class UserCancelException(BaseException):
        """User interrupt exception."""
        # Derives from BaseException (not Exception), so it is not caught
        # by `except Exception` handlers.

    #: Signal emitted with the current read progress. First value is the current
    #: progress state, second value is the total progress to complete
    #: (-1 if unknown)
    progressChanged = Signal('qint64', 'qint64')
    # Private relay signal; connected to `progressChanged` in `__init__`.
    __progressChanged = Signal('qint64', 'qint64')
    #: Was cancel requested.
    cancel = False

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Forward the private signal to the public one using a queued
        # connection, i.e. route the signal via this object's queue.
        relay = self.__progressChanged
        relay.connect(self.progressChanged, Qt.QueuedConnection)

    def emitProgressChangedOrCancel(self, current, total):
        # type: (int, int) -> None
        """
        Report progress, or abort if cancellation was requested.

        Raise `TaskState.UserCancelException` when `cancel` is set;
        otherwise emit the progressChanged signal with `current` and
        `total`.
        """
        if self.cancel:
            raise TaskState.UserCancelException()
        self.__progressChanged.emit(current, total)
1718
1719
class TextReadWrapper(io.TextIOWrapper):
    """
    TextIOWrapper reporting the read progress.

    Assuming a single forward read pass.

    After every `read` the `progress_callback` is called with the current
    position of the OS level file behind `buffer` and that file's total
    size. Either value is -1 when it can not be determined (e.g. `buffer`
    is not backed by a file descriptor).

    Parameters
    ----------
    buffer : BinaryIO
        The binary stream to wrap.
    progress_callback : Optional[Callable[[int, int], None]]
        Called after each read; defaults to a no-op.
    *args, **kwargs
        Passed on to `io.TextIOWrapper`.
    """

    #: A `Callable[[int, int], []]` called when the file position is
    #: advanced by read; called with current byte position and total
    #: file size.
    progress_callback = ...  # type: Callable[[int, int], None]

    def __init__(self, buffer, *args, progress_callback=None, **kwargs):
        super().__init__(buffer, *args, **kwargs)
        if progress_callback is None:
            def progress_callback(i, j):  # pylint: disable=unused-argument
                pass
        self.progress_callback = progress_callback
        # Tolerate the same failures here as in `read`: a buffer without
        # a `fileno` method (AttributeError) or without a real descriptor
        # (io.UnsupportedOperation, which is an OSError subclass).
        try:
            self.__size = os.fstat(buffer.fileno()).st_size
        except (AttributeError, OSError):
            self.__size = -1

    def read(self, size=-1):
        """
        Read and return at most `size` characters, then report progress.
        """
        s = super().read(size)
        # try to go around any gzip/bz2/lzma wrappers to the base
        # raw file (does not work for zipfile.ZipExtFile; should
        # dispatch on buffer type)
        try:
            pos = os.lseek(self.buffer.fileno(), 0, os.SEEK_CUR)
        except (AttributeError, OSError):
            # Progress reporting is best effort; -1 marks 'unknown'.
            pos = -1

        self.progress_callback(pos, self.__size)
        return s
1760
1761
def index_where(iterable, pred):
    # type: (Iterable[T], Callable[[T], bool]) -> Optional[int]
    """
    Find the position of the first element of `iterable` satisfying `pred`.

    Return the zero-based index of that element, or `None` if `pred`
    holds for no element. The iterable is consumed only up to (and
    including) the first match.
    """
    matching_positions = (
        position for position, item in enumerate(iterable) if pred(item)
    )
    return next(matching_positions, None)
1773
1774
def pandas_to_table(df):
    # type: (pd.DataFrame) -> Orange.data.Table
    """
    Convert a pandas.DataFrame to a Orange.data.Table instance.

    If the frame's index is not a `pd.RangeIndex` it is first turned into
    regular column(s) with `reset_index`. Columns are then mapped by dtype:

    * categorical -> DiscreteVariable; the values are the natural-sorted
      string forms of the categories, entries without a category are NaN,
    * datetime -> TimeVariable; stored as seconds (nanosecond timestamps
      divided by 10**9), NaT is stored as NaN,
    * object -> StringVariable; missing entries are replaced by '',
    * integer -> ContinuousVariable with `number_of_decimals` set to 0,
    * any other numeric dtype -> ContinuousVariable.

    Columns of any other dtype are skipped with a `UserWarning`.

    Variables for which `is_primitive()` is true are stored as attributes
    (X); all the others are stored as metas.

    Parameters
    ----------
    df : pd.DataFrame

    Returns
    -------
    table : Orange.data.Table
    """
    index = df.index
    if not isinstance(index, pd.RangeIndex):
        df = df.reset_index()

    columns = []  # type: List[Tuple[Orange.data.Variable, np.ndarray]]

    for header, series in df.items():  # type: (Any, pd.Series)
        # `pdtypes.is_categorical_dtype` is deprecated in recent pandas;
        # checking the dtype's class is the documented replacement.
        if isinstance(series.dtype, pd.CategoricalDtype):
            coldata = series.values  # type: pd.Categorical
            categories = natural_sorted(str(c) for c in coldata.categories)
            var = Orange.data.DiscreteVariable.make(
                str(header), values=categories
            )
            # Remap the coldata into the var.values order/set
            coldata = pd.Categorical(
                coldata.astype("str"), categories=var.values
            )
            codes = coldata.codes
            assert np.issubdtype(codes.dtype, np.integer)
            # NOTE: the `np.float` alias was removed from NumPy; use the
            # explicit 64 bit float type.
            orangecol = np.array(codes, dtype=np.float64)
            # Negative codes mark values that are not in `categories`.
            orangecol[codes < 0] = np.nan
        elif pdtypes.is_datetime64_any_dtype(series):
            if isinstance(series.dtype, pd.DatetimeTZDtype):
                # Recent pandas refuses `astype` from tz-aware to tz-naive;
                # convert to UTC and drop the tz information explicitly.
                series = series.dt.tz_convert("UTC").dt.tz_localize(None)
            series = series.astype(np.dtype("M8[ns]"))
            coldata = series.values  # type: np.ndarray
            assert coldata.dtype == "M8[ns]"
            mask = np.isnat(coldata)
            # nanoseconds -> (float) seconds
            orangecol = coldata.astype(np.int64) / 10 ** 9
            orangecol[mask] = np.nan
            var = Orange.data.TimeVariable.make(str(header))
            var.have_date = var.have_time = 1
        elif pdtypes.is_object_dtype(series):
            coldata = series.fillna('').values
            assert isinstance(coldata, np.ndarray)
            orangecol = coldata
            var = Orange.data.StringVariable.make(str(header))
        elif pdtypes.is_integer_dtype(series):
            coldata = series.values
            var = Orange.data.ContinuousVariable.make(str(header))
            var.number_of_decimals = 0
            orangecol = coldata.astype(np.float64)
        elif pdtypes.is_numeric_dtype(series):
            orangecol = series.values.astype(np.float64)
            var = Orange.data.ContinuousVariable.make(str(header))
        else:
            warnings.warn(
                "Column '{}' with dtype: {} skipped."
                .format(header, series.dtype),
                UserWarning
            )
            continue
        columns.append((var, orangecol))

    # Split into attribute (primitive) and meta (non primitive) columns.
    cols_x = [(var, col) for var, col in columns if var.is_primitive()]
    cols_m = [(var, col) for var, col in columns if not var.is_primitive()]

    variables = [v for v, _ in cols_x]
    if cols_x:
        X = np.column_stack([a for _, a in cols_x])
    else:
        # No attributes: keep the row count with zero columns.
        X = np.empty((df.shape[0], 0), dtype=np.float64)
    metas = [v for v, _ in cols_m]
    if cols_m:
        M = np.column_stack([a for _, a in cols_m])
    else:
        M = None

    domain = Orange.data.Domain(variables, metas=metas)
    return Orange.data.Table.from_numpy(domain, X, None, M)
1849
1850
def main(argv=None):  # pragma: no cover
    """
    Run the OWCSVFileImport widget as a standalone application.

    Create the QApplication (with `argv`, or an empty list when `argv` is
    not given or empty), show the widget and run the event loop. Once the
    loop exits, save the widget's settings and call its `onDeleteWidget`.

    Returns
    -------
    status : int
        Always 0.
    """
    application = QApplication(argv if argv else [])
    window = OWCSVFileImport()
    window.show()
    window.raise_()
    application.exec()
    # The event loop has finished; persist settings and clean up.
    window.saveSettings()
    window.onDeleteWidget()
    return 0
1860
1861
# Script entry point: run the widget standalone and use main()'s return
# value as the process exit status.
if __name__ == "__main__":  # pragma: no cover
    raise SystemExit(main(sys.argv))
1864