1# -*- coding: utf-8 -*- 2""" 3CSV File Import Widget 4---------------------- 5 6""" 7import sys 8import types 9import os 10import csv 11import enum 12import io 13import traceback 14import warnings 15import logging 16import weakref 17import json 18 19import gzip 20import lzma 21import bz2 22import zipfile 23 24from xml.sax.saxutils import escape 25from functools import singledispatch 26from contextlib import ExitStack 27 28import typing 29from typing import ( 30 List, Tuple, Dict, Optional, Any, Callable, Iterable, 31 Union, AnyStr, BinaryIO, Set, Type, Mapping, Sequence, NamedTuple 32) 33 34from AnyQt.QtCore import ( 35 Qt, QFileInfo, QTimer, QSettings, QObject, QSize, QMimeDatabase, QMimeType 36) 37from AnyQt.QtGui import ( 38 QStandardItem, QStandardItemModel, QPalette, QColor, QIcon 39) 40from AnyQt.QtWidgets import ( 41 QLabel, QComboBox, QPushButton, QDialog, QDialogButtonBox, QGridLayout, 42 QVBoxLayout, QSizePolicy, QStyle, QFileIconProvider, QFileDialog, 43 QApplication, QMessageBox, QTextBrowser, QMenu 44) 45from AnyQt.QtCore import pyqtSlot as Slot, pyqtSignal as Signal 46 47import numpy as np 48import pandas.errors 49import pandas as pd 50 51from pandas.api import types as pdtypes 52 53import Orange.data 54from Orange.misc.collections import natural_sorted 55 56from Orange.widgets import widget, gui, settings 57from Orange.widgets.utils.concurrent import PyOwned 58from Orange.widgets.utils import ( 59 textimport, concurrent as qconcurrent, unique_everseen, enum_get, qname 60) 61from Orange.widgets.utils.combobox import ItemStyledComboBox 62from Orange.widgets.utils.pathutils import ( 63 PathItem, VarPath, AbsPath, samepath, prettyfypath, isprefixed, 64) 65from Orange.widgets.utils.overlay import OverlayWidget 66from Orange.widgets.utils.settings import ( 67 QSettings_readArray, QSettings_writeArray 68) 69 70if typing.TYPE_CHECKING: 71 # pylint: disable=invalid-name 72 T = typing.TypeVar("T") 73 K = typing.TypeVar("K") 74 E = typing.TypeVar("E", bound=enum.Enum) 75 76__all__ = ["OWCSVFileImport"] 77 78_log = logging.getLogger(__name__) 79 80ColumnType = textimport.ColumnType 81RowSpec = textimport.RowSpec 82 83 84def dialect_eq(lhs, rhs): 85 # type: (csv.Dialect, csv.Dialect) -> bool 86 """Compare 2 `csv.Dialect` instances for equality.""" 87 return (lhs.delimiter == rhs.delimiter and 88 lhs.quotechar == rhs.quotechar and 89 lhs.doublequote == rhs.doublequote and 90 lhs.escapechar == rhs.escapechar and 91 lhs.quoting == rhs.quoting and 92 lhs.skipinitialspace == rhs.skipinitialspace) 93 94 95class Options: 96 """ 97 Stored options for loading CSV-like file. 98 99 Arguments 100 --------- 101 encoding : str 102 A encoding to use for reading. 103 dialect : csv.Dialect 104 A csv.Dialect instance. 105 columntypes: Iterable[Tuple[range, ColumnType]] 106 A list of column type ranges specifying the types for columns. 107 Need not list all columns. Columns not listed are assumed to have auto 108 type inference. 109 rowspec : Iterable[Tuple[range, RowSpec]] 110 A list of row spec ranges. 111 decimal_separator : str 112 Decimal separator - a single character string; default: `"."` 113 group_separator : str 114 Thousands group separator - empty or a single character string; 115 default: empty string 116 """ 117 RowSpec = RowSpec 118 ColumnType = ColumnType 119 120 def __init__(self, encoding='utf-8', dialect=csv.excel(), 121 columntypes: Iterable[Tuple[range, 'ColumnType']] = (), 122 rowspec=((range(0, 1), RowSpec.Header),), 123 decimal_separator=".", group_separator="") -> None: 124 self.encoding = encoding 125 self.dialect = dialect 126 self.columntypes = list(columntypes) # type: List[Tuple[range, ColumnType]] 127 self.rowspec = list(rowspec) # type: List[Tuple[range, RowSpec]] 128 self.decimal_separator = decimal_separator 129 self.group_separator = group_separator 130 131 def __eq__(self, other): 132 """ 133 Compare this instance to `other` for equality. 134 """ 135 if isinstance(other, Options): 136 return (dialect_eq(self.dialect, other.dialect) and 137 self.encoding == other.encoding and 138 self.columntypes == other.columntypes and 139 self.rowspec == other.rowspec and 140 self.group_separator == other.group_separator and 141 self.decimal_separator == other.decimal_separator) 142 else: 143 return NotImplemented 144 145 def __repr__(self): 146 class_, args = self.__reduce__() 147 return "{}{!r}".format(class_.__name__, args) 148 __str__ = __repr__ 149 150 def __reduce__(self): 151 return type(self), (self.encoding, self.dialect, 152 self.columntypes, self.rowspec) 153 154 def as_dict(self): 155 # type: () -> Dict[str, Any] 156 """ 157 Return return Option parameters as plain types suitable for 158 serialization (e.g JSON serializable). 159 """ 160 return { 161 "encoding": self.encoding, 162 "delimiter": self.dialect.delimiter, 163 "quotechar": self.dialect.quotechar, 164 "doublequote": self.dialect.doublequote, 165 "skipinitialspace": self.dialect.skipinitialspace, 166 "quoting": self.dialect.quoting, 167 "columntypes": Options.spec_as_encodable(self.columntypes), 168 "rowspec": Options.spec_as_encodable(self.rowspec), 169 "decimal_separator": self.decimal_separator, 170 "group_separator": self.group_separator, 171 } 172 173 @staticmethod 174 def from_dict(mapping): 175 # type: (Dict[str, Any]) -> Options 176 """ 177 Reconstruct a `Options` from a plain dictionary (see :func:`as_dict`). 178 """ 179 encoding = mapping["encoding"] 180 delimiter = mapping["delimiter"] 181 quotechar = mapping["quotechar"] 182 doublequote = mapping["doublequote"] 183 quoting = mapping["quoting"] 184 skipinitialspace = mapping["skipinitialspace"] 185 186 dialect = textimport.Dialect( 187 delimiter, quotechar, None, doublequote, skipinitialspace, 188 quoting=quoting) 189 190 colspec = mapping["columntypes"] 191 rowspec = mapping["rowspec"] 192 colspec = Options.spec_from_encodable(colspec, ColumnType) 193 rowspec = Options.spec_from_encodable(rowspec, RowSpec) 194 decimal = mapping.get("decimal_separator", ".") 195 group = mapping.get("group_separator", "") 196 197 return Options(encoding, dialect, colspec, rowspec, 198 decimal_separator=decimal, 199 group_separator=group) 200 201 @staticmethod 202 def spec_as_encodable(spec): 203 # type: (Iterable[Tuple[range, enum.Enum]]) -> List[Dict[str, Any]] 204 return [{"start": r.start, "stop": r.stop, "value": value.name} 205 for r, value in spec] 206 207 @staticmethod 208 def spec_from_encodable(spec, enumtype): 209 # type: (Iterable[Dict[str, Any]], Type[E]) -> List[Tuple[range, E]] 210 r = [] 211 for v in spec: 212 try: 213 start, stop, name = v["start"], v["stop"], v["value"] 214 except (KeyError, ValueError): 215 pass 216 else: 217 r.append((range(start, stop), enum_get(enumtype, name, None))) 218 return r 219 220 221class CSVImportDialog(QDialog): 222 """ 223 A dialog for selecting CSV file import options. 224 """ 225 def __init__(self, parent=None, flags=Qt.Dialog, **kwargs): 226 super().__init__(parent, flags, **kwargs) 227 self.setLayout(QVBoxLayout()) 228 229 self._options = None 230 self._path = None 231 # Finalizer for opened file handle (in _update_preview) 232 self.__finalizer = None # type: Optional[Callable[[], None]] 233 self._optionswidget = textimport.CSVImportWidget() 234 self._optionswidget.previewReadErrorOccurred.connect( 235 self.__on_preview_error 236 ) 237 self._optionswidget.previewModelReset.connect( 238 self.__on_preview_reset 239 ) 240 self._buttons = buttons = QDialogButtonBox( 241 orientation=Qt.Horizontal, 242 standardButtons=(QDialogButtonBox.Ok | QDialogButtonBox.Cancel | 243 QDialogButtonBox.Reset | 244 QDialogButtonBox.RestoreDefaults), 245 objectName="dialog-button-box", 246 ) 247 # TODO: Help button 248 buttons.accepted.connect(self.accept) 249 buttons.rejected.connect(self.reject) 250 251 b = buttons.button(QDialogButtonBox.Reset) 252 b.clicked.connect(self.reset) 253 b = buttons.button(QDialogButtonBox.RestoreDefaults) 254 b.clicked.connect(self.restoreDefaults) 255 self.layout().addWidget(self._optionswidget) 256 self.layout().addWidget(buttons) 257 self.setSizePolicy(QSizePolicy.Expanding, QSizePolicy.Expanding) 258 259 self._overlay = OverlayWidget(self) 260 self._overlay.setWidget(self._optionswidget.dataview) 261 self._overlay.setLayout(QVBoxLayout()) 262 self._overlay.layout().addWidget(QLabel(wordWrap=True)) 263 self._overlay.hide() 264 265 def setOptions(self, options): 266 # type: (Options) -> None 267 self._options = options 268 self._optionswidget.setEncoding(options.encoding) 269 self._optionswidget.setDialect(options.dialect) 270 self._optionswidget.setNumbersFormat( 271 options.group_separator, options.decimal_separator) 272 self._optionswidget.setColumnTypeRanges(options.columntypes) 273 self._optionswidget.setRowStates( 274 {i: v for r, v in options.rowspec for i in r} 275 ) 276 277 def options(self): 278 # type: () -> Options 279 rowspec_ = self._optionswidget.rowStates() 280 rowspec = [(range(i, i + 1), v) for i, v in rowspec_.items()] 281 numformat = self._optionswidget.numbersFormat() 282 return Options( 283 encoding=self._optionswidget.encoding(), 284 dialect=self._optionswidget.dialect(), 285 columntypes=self._optionswidget.columnTypeRanges(), 286 rowspec=rowspec, 287 decimal_separator=numformat["decimal"], 288 group_separator=numformat["group"], 289 ) 290 291 def setPath(self, path): 292 """ 293 Set the preview path. 294 """ 295 if self._path != path: 296 self._path = path 297 self.__update_preview() 298 299 def path(self): 300 """Return the preview path""" 301 return self._path 302 303 def reset(self): 304 """ 305 Reset the options to the state previously set with `setOptions` 306 effectively undoing any user modifications since then. 307 """ 308 self.setOptions(self._options) 309 310 def restoreDefaults(self): 311 """ 312 Restore the options to default state. 313 """ 314 # preserve `_options` if set by clients (for `reset`). 315 opts = self._options 316 self.setOptions(Options("utf-8", csv.excel())) 317 self._options = opts 318 319 def __update_preview(self): 320 if not self._path: 321 return 322 try: 323 f = _open(self._path, "rb") 324 except OSError as err: 325 traceback.print_exc(file=sys.stderr) 326 fmt = "".join(traceback.format_exception_only(type(err), err)) 327 self.__set_error(fmt) 328 else: 329 self.__clear_error() 330 self._optionswidget.setSampleContents(f) 331 closeexisting = self.__finalizer 332 if closeexisting is not None: 333 self.destroyed.disconnect(closeexisting) 334 closeexisting() 335 self.__finalizer = weakref.finalize(self, f.close) 336 self.destroyed.connect(self.__finalizer) 337 338 def __set_error(self, text, format=Qt.PlainText): 339 self._optionswidget.setEnabled(False) 340 label = self._overlay.findChild(QLabel) # type: QLabel 341 label.setText(text) 342 label.setTextFormat(format) 343 self._overlay.show() 344 self._overlay.raise_() 345 dialog_button_box_set_enabled(self._buttons, False) 346 347 def __clear_error(self): 348 if self._overlay.isVisibleTo(self): 349 self._overlay.hide() 350 self._optionswidget.setEnabled(True) 351 352 # Enable/disable the accept buttons on the most egregious errors. 353 def __on_preview_error(self): 354 b = self._buttons.button(QDialogButtonBox.Ok) 355 b.setEnabled(False) 356 357 def __on_preview_reset(self): 358 b = self._buttons.button(QDialogButtonBox.Ok) 359 b.setEnabled(True) 360 361 362def dialog_button_box_set_enabled(buttonbox, enabled): 363 # type: (QDialogButtonBox, bool) -> None 364 """ 365 Disable/enable buttons in a QDialogButtonBox based on their role. 366 367 All buttons except the ones with RejectRole or HelpRole are disabled. 368 """ 369 stashname = "__p_dialog_button_box_set_enabled" 370 for b in buttonbox.buttons(): 371 role = buttonbox.buttonRole(b) 372 if not enabled: 373 if b.property(stashname) is None: 374 b.setProperty(stashname, b.isEnabledTo(buttonbox)) 375 b.setEnabled( 376 role == QDialogButtonBox.RejectRole or 377 role == QDialogButtonBox.HelpRole 378 ) 379 else: 380 stashed_state = b.property(stashname) 381 if isinstance(stashed_state, bool): 382 state = stashed_state 383 b.setProperty(stashname, None) 384 else: 385 state = True 386 b.setEnabled(state) 387 388 389def icon_for_path(path: str) -> QIcon: 390 iconprovider = QFileIconProvider() 391 finfo = QFileInfo(path) 392 if finfo.exists(): 393 return iconprovider.icon(finfo) 394 else: 395 return iconprovider.icon(QFileIconProvider.File) 396 397 398class VarPathItem(QStandardItem): 399 PathRole = Qt.UserRole + 4502 400 VarPathRole = PathRole + 1 401 402 def path(self) -> str: 403 """Return the resolved path or '' if unresolved or missing""" 404 path = self.data(VarPathItem.PathRole) 405 return path if isinstance(path, str) else "" 406 407 def setPath(self, path: str) -> None: 408 """Set absolute path.""" 409 self.setData(PathItem.AbsPath(path), VarPathItem.VarPathRole) 410 411 def varPath(self) -> Optional[PathItem]: 412 vpath = self.data(VarPathItem.VarPathRole) 413 return vpath if isinstance(vpath, PathItem) else None 414 415 def setVarPath(self, vpath: PathItem) -> None: 416 """Set variable path item.""" 417 self.setData(vpath, VarPathItem.VarPathRole) 418 419 def resolve(self, vpath: PathItem) -> Optional[str]: 420 """ 421 Resolve `vpath` item. This implementation dispatches to parent model's 422 (:func:`VarPathItemModel.resolve`) 423 """ 424 model = self.model() 425 if isinstance(model, VarPathItemModel): 426 return model.resolve(vpath) 427 else: 428 return vpath.resolve({}) 429 430 def data(self, role=Qt.UserRole + 1) -> Any: 431 if role == Qt.DisplayRole: 432 value = super().data(role) 433 if value is not None: 434 return value 435 vpath = self.varPath() 436 if isinstance(vpath, PathItem.AbsPath): 437 return os.path.basename(vpath.path) 438 elif isinstance(vpath, PathItem.VarPath): 439 return os.path.basename(vpath.relpath) 440 else: 441 return None 442 elif role == Qt.DecorationRole: 443 return icon_for_path(self.path()) 444 elif role == VarPathItem.PathRole: 445 vpath = self.data(VarPathItem.VarPathRole) 446 if isinstance(vpath, PathItem.AbsPath): 447 return vpath.path 448 elif isinstance(vpath, VarPath): 449 path = self.resolve(vpath) 450 if path is not None: 451 return path 452 return super().data(role) 453 elif role == Qt.ToolTipRole: 454 vpath = self.data(VarPathItem.VarPathRole) 455 if isinstance(vpath, VarPath.AbsPath): 456 return vpath.path 457 elif isinstance(vpath, VarPath): 458 text = f"${{{vpath.name}}}/{vpath.relpath}" 459 p = self.resolve(vpath) 460 if p is None or not os.path.exists(p): 461 text += " (missing)" 462 return text 463 elif role == Qt.ForegroundRole: 464 vpath = self.data(VarPathItem.VarPathRole) 465 if isinstance(vpath, PathItem): 466 p = self.resolve(vpath) 467 if p is None or not os.path.exists(p): 468 return QColor(Qt.red) 469 return super().data(role) 470 471 472class ImportItem(VarPathItem): 473 """ 474 An item representing a file path and associated load options 475 """ 476 OptionsRole = Qt.UserRole + 14 477 IsSessionItemRole = Qt.UserRole + 15 478 479 def options(self) -> Optional[Options]: 480 options = self.data(ImportItem.OptionsRole) 481 return options if isinstance(options, Options) else None 482 483 def setOptions(self, options: Options) -> None: 484 self.setData(options, ImportItem.OptionsRole) 485 486 def setIsSessionItem(self, issession: bool) -> None: 487 self.setData(issession, ImportItem.IsSessionItemRole) 488 489 def isSessionItem(self) -> bool: 490 return bool(self.data(ImportItem.IsSessionItemRole)) 491 492 @classmethod 493 def fromPath(cls, path: Union[str, PathItem]) -> 'ImportItem': 494 """ 495 Create a `ImportItem` from a local file system path. 496 """ 497 if isinstance(path, str): 498 path = PathItem.AbsPath(path) 499 if isinstance(path, PathItem.VarPath): 500 basename = os.path.basename(path.relpath) 501 text = f"${{{path.name}}}/{path.relpath}" 502 elif isinstance(path, PathItem.AbsPath): 503 basename = os.path.basename(path.path) 504 text = path.path 505 else: 506 raise TypeError 507 508 item = cls() 509 item.setText(basename) 510 item.setToolTip(text) 511 item.setData(path, ImportItem.VarPathRole) 512 return item 513 514 515class VarPathItemModel(QStandardItemModel): 516 def __init__(self, *args, replacementEnv=types.MappingProxyType({}), 517 **kwargs): 518 self.__replacements = types.MappingProxyType(dict(replacementEnv)) 519 super().__init__(*args, **kwargs) 520 521 def setReplacementEnv(self, env: Mapping[str, str]) -> None: 522 self.__replacements = types.MappingProxyType(dict(env)) 523 self.dataChanged.emit( 524 self.index(0, 0), 525 self.index(self.rowCount() - 1, self.columnCount() - 1) 526 ) 527 528 def replacementEnv(self) -> Mapping[str, str]: 529 return self.__replacements 530 531 def resolve(self, vpath: PathItem) -> Optional[str]: 532 return vpath.resolve(self.replacementEnv()) 533 534 535def move_item_to_index(model: QStandardItemModel, item: QStandardItem, index: int): 536 if item.row() == index: 537 return 538 assert item.model() is model 539 [item_] = model.takeRow(item.row()) 540 assert item_ is item 541 model.insertRow(index, [item]) 542 543 544class FileFormat(NamedTuple): 545 mime_type: str 546 name: str 547 globs: Sequence[str] 548 549 550FileFormats = [ 551 FileFormat("text/csv", "Text - comma separated", ("*.csv", "*")), 552 FileFormat("text/tab-separated-values", "Text - tab separated", ("*.tsv", "*")), 553 FileFormat("text/plain", "Text - all files", ("*.txt", "*")), 554] 555 556 557class FileDialog(QFileDialog): 558 __formats: Sequence[FileFormat] = () 559 560 @staticmethod 561 def filterStr(f: FileFormat) -> str: 562 return f"{f.name} ({', '.join(f.globs)})" 563 564 def setFileFormats(self, formats: Sequence[FileFormat]): 565 filters = [FileDialog.filterStr(f) for f in formats] 566 self.__formats = tuple(formats) 567 self.setNameFilters(filters) 568 569 def fileFormats(self) -> Sequence[FileFormat]: 570 return self.__formats 571 572 def selectedFileFormat(self) -> FileFormat: 573 filter_ = self.selectedNameFilter() 574 index = index_where( 575 self.__formats, lambda f: FileDialog.filterStr(f) == filter_ 576 ) 577 return self.__formats[index] 578 579 580def default_options_for_mime_type( 581 path: str, mime_type: str 582) -> Options: 583 defaults = { 584 "text/csv": (csv.excel(), True), 585 "text/tab-separated-values": (csv.excel_tab(), True) 586 } 587 dialect, header, encoding = csv.excel(), True, "utf-8" 588 delimiters = None 589 try_encodings = ["utf-8", "utf-16", "iso8859-1"] 590 if mime_type in defaults: 591 dialect, header = defaults[mime_type] 592 delimiters = [dialect.delimiter] 593 594 for encoding_ in try_encodings: 595 try: 596 dialect, header = sniff_csv_with_path( 597 path, encoding=encoding_, delimiters=delimiters) 598 encoding = encoding_ 599 except (OSError, UnicodeError, csv.Error): 600 pass 601 else: 602 break 603 if header: 604 rowspec = [(range(0, 1), RowSpec.Header)] 605 else: 606 rowspec = [] 607 return Options(dialect=dialect, encoding=encoding, rowspec=rowspec) 608 609 610class OWCSVFileImport(widget.OWWidget): 611 name = "CSV File Import" 612 description = "Import a data table from a CSV formatted file." 613 icon = "icons/CSVFile.svg" 614 priority = 11 615 category = "Data" 616 keywords = ["file", "load", "read", "open", "csv"] 617 618 class Outputs: 619 data = widget.Output( 620 name="Data", 621 type=Orange.data.Table, 622 doc="Loaded data set.") 623 data_frame = widget.Output( 624 name="Data Frame", 625 type=pd.DataFrame, 626 doc="", 627 auto_summary=False 628 ) 629 630 class Error(widget.OWWidget.Error): 631 error = widget.Msg( 632 "Unexpected error" 633 ) 634 encoding_error = widget.Msg( 635 "Encoding error\n" 636 "The file might be encoded in an unsupported encoding or it " 637 "might be binary" 638 ) 639 640 #: Paths and options of files accessed in a 'session' 641 _session_items = settings.Setting( 642 [], schema_only=True) # type: List[Tuple[str, dict]] 643 644 _session_items_v2 = settings.Setting( 645 [], schema_only=True) # type: List[Tuple[Dict[str, str], dict]] 646 #: Saved dialog state (last directory and selected filter) 647 dialog_state = settings.Setting({ 648 "directory": "", 649 "filter": "" 650 }) # type: Dict[str, str] 651 652 # we added column type guessing to this widget, which breaks compatibility 653 # with older saved workflows, where types not guessed differently, when 654 # compatibility_mode=True widget have older guessing behaviour 655 settings_version = 3 656 compatibility_mode = settings.Setting(False, schema_only=True) 657 658 MaxHistorySize = 50 659 660 want_main_area = False 661 buttons_area_orientation = None 662 resizing_enabled = False 663 664 def __init__(self, *args, **kwargs): 665 super().__init__(self, *args, **kwargs) 666 self.settingsAboutToBePacked.connect(self._saveState) 667 668 self.__committimer = QTimer(self, singleShot=True) 669 self.__committimer.timeout.connect(self.commit) 670 671 self.__executor = qconcurrent.ThreadExecutor() 672 self.__watcher = None # type: Optional[qconcurrent.FutureWatcher] 673 674 self.controlArea.layout().setSpacing(-1) # reset spacing 675 grid = QGridLayout() 676 grid.addWidget(QLabel("File:", self), 0, 0, 1, 1) 677 678 self.import_items_model = VarPathItemModel(self) 679 self.import_items_model.setReplacementEnv(self._replacements()) 680 self.recent_combo = ItemStyledComboBox( 681 self, objectName="recent-combo", toolTip="Recent files.", 682 sizeAdjustPolicy=QComboBox.AdjustToMinimumContentsLengthWithIcon, 683 minimumContentsLength=16, placeholderText="Recent files…" 684 ) 685 self.recent_combo.setModel(self.import_items_model) 686 self.recent_combo.activated.connect(self.activate_recent) 687 self.recent_combo.setSizePolicy( 688 QSizePolicy.MinimumExpanding, QSizePolicy.Fixed) 689 self.browse_button = QPushButton( 690 "…", icon=self.style().standardIcon(QStyle.SP_DirOpenIcon), 691 toolTip="Browse filesystem", autoDefault=False, 692 ) 693 # A button drop down menu with selection of explicit workflow dir 694 # relative import. This is only enabled when 'basedir' workflow env 695 # is set. XXX: Always use menu, disable Import relative... action? 696 self.browse_menu = menu = QMenu(self.browse_button) 697 ac = menu.addAction("Import any file…") 698 ac.triggered.connect(self.browse) 699 700 ac = menu.addAction("Import relative to workflow file…") 701 ac.setToolTip("Import a file within the workflow file directory") 702 ac.triggered.connect(lambda: self.browse_relative("basedir")) 703 704 if "basedir" in self._replacements(): 705 self.browse_button.setMenu(menu) 706 707 self.browse_button.setSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed) 708 self.browse_button.clicked.connect(self.browse) 709 grid.addWidget(self.recent_combo, 0, 1, 1, 1) 710 grid.addWidget(self.browse_button, 0, 2, 1, 1) 711 self.controlArea.layout().addLayout(grid) 712 713 ########### 714 # Info text 715 ########### 716 box = gui.widgetBox(self.controlArea, "Info") 717 self.summary_text = QTextBrowser( 718 verticalScrollBarPolicy=Qt.ScrollBarAsNeeded, 719 readOnly=True, 720 ) 721 self.summary_text.viewport().setBackgroundRole(QPalette.NoRole) 722 self.summary_text.setFrameStyle(QTextBrowser.NoFrame) 723 self.summary_text.setMinimumHeight(self.fontMetrics().ascent() * 2 + 4) 724 self.summary_text.viewport().setAutoFillBackground(False) 725 box.layout().addWidget(self.summary_text) 726 727 button_box = QDialogButtonBox( 728 orientation=Qt.Horizontal, 729 standardButtons=QDialogButtonBox.Cancel | QDialogButtonBox.Retry 730 ) 731 self.load_button = b = button_box.button(QDialogButtonBox.Retry) 732 b.setText("Load") 733 b.clicked.connect(self.__committimer.start) 734 b.setEnabled(False) 735 b.setDefault(True) 736 737 self.cancel_button = b = button_box.button(QDialogButtonBox.Cancel) 738 b.clicked.connect(self.cancel) 739 b.setEnabled(False) 740 b.setAutoDefault(False) 741 742 self.import_options_button = QPushButton( 743 "Import Options…", enabled=False, autoDefault=False, 744 clicked=self._activate_import_dialog 745 ) 746 747 def update_buttons(cbindex): 748 self.import_options_button.setEnabled(cbindex != -1) 749 self.load_button.setEnabled(cbindex != -1) 750 self.recent_combo.currentIndexChanged.connect(update_buttons) 751 752 button_box.addButton( 753 self.import_options_button, QDialogButtonBox.ActionRole 754 ) 755 button_box.setStyleSheet( 756 "button-layout: {:d};".format(QDialogButtonBox.MacLayout) 757 ) 758 self.controlArea.layout().addWidget(button_box) 759 self.setSizePolicy(QSizePolicy.MinimumExpanding, QSizePolicy.Maximum) 760 761 self._restoreState() 762 item = self.current_item() 763 if item is not None: 764 self._invalidate() 765 766 def workflowEnvChanged(self, key, value, oldvalue): 767 super().workflowEnvChanged(key, value, oldvalue) 768 if key == "basedir": 769 self.browse_button.setMenu(self.browse_menu) 770 self.import_items_model.setReplacementEnv(self._replacements()) 771 772 @Slot(int) 773 def activate_recent(self, index): 774 """ 775 Activate an item from the recent list. 776 """ 777 model = self.import_items_model 778 cb = self.recent_combo 779 if 0 <= index < model.rowCount(): 780 item = model.item(index) 781 assert isinstance(item, ImportItem) 782 path = item.path() 783 item.setData(True, ImportItem.IsSessionItemRole) 784 move_item_to_index(model, item, 0) 785 if not os.path.exists(path): 786 self._browse_for_missing( 787 item, onfinished=lambda status: self._invalidate() 788 ) 789 else: 790 cb.setCurrentIndex(0) 791 self._invalidate() 792 else: 793 self.recent_combo.setCurrentIndex(-1) 794 795 def _browse_for_missing( 796 self, item: ImportItem, *, onfinished: Optional[Callable[[int], Any]] = None): 797 dlg = self._browse_dialog() 798 model = self.import_items_model 799 800 if onfinished is None: 801 onfinished = lambda status: None 802 803 vpath = item.varPath() 804 prefixpath = None 805 if isinstance(vpath, PathItem.VarPath): 806 prefixpath = self._replacements().get(vpath.name) 807 if prefixpath is not None: 808 dlg.setDirectory(prefixpath) 809 dlg.setAttribute(Qt.WA_DeleteOnClose) 810 811 def accepted(): 812 path = dlg.selectedFiles()[0] 813 if isinstance(vpath, VarPath) and not isprefixed(prefixpath, path): 814 mb = self._path_must_be_relative_mb(prefixpath) 815 mb.show() 816 mb.finished.connect(lambda _: onfinished(QDialog.Rejected)) 817 return 818 819 # pre-flight check; try to determine the nature of the file 820 mtype = _mime_type_for_path(path) 821 if not mtype.inherits("text/plain"): 822 mb = self._might_be_binary_mb(path) 823 if mb.exec() == QMessageBox.Cancel: 824 if onfinished: 825 onfinished(QDialog.Rejected) 826 return 827 828 if isinstance(vpath, VarPath): 829 vpath_ = VarPath(vpath.name, os.path.relpath(path, prefixpath)) 830 else: 831 vpath_ = AbsPath(path) 832 item.setVarPath(vpath_) 833 if item.row() != 0: 834 move_item_to_index(model, item, 0) 835 item.setData(True, ImportItem.IsSessionItemRole) 836 self.set_selected_file(path, item.options()) 837 self._note_recent(path, item.options()) 838 onfinished(QDialog.Accepted) 839 840 dlg.accepted.connect(accepted) 841 dlg.open() 842 843 def _browse_dialog(self): 844 dlg = FileDialog( 845 self, windowTitle=self.tr("Open Data File"), 846 acceptMode=QFileDialog.AcceptOpen, 847 fileMode=QFileDialog.ExistingFile 848 ) 849 850 dlg.setFileFormats(FileFormats) 851 state = self.dialog_state 852 lastdir = state.get("directory", "") 853 lastfilter = state.get("filter", "") 854 if lastdir and os.path.isdir(lastdir): 855 dlg.setDirectory(lastdir) 856 if lastfilter: 857 dlg.selectNameFilter(lastfilter) 858 859 def store_state(): 860 state["directory"] = dlg.directory().absolutePath() 861 state["filter"] = dlg.selectedNameFilter() 862 dlg.accepted.connect(store_state) 863 return dlg 864 865 def _might_be_binary_mb(self, path) -> QMessageBox: 866 mb = QMessageBox( 867 parent=self, 868 windowTitle=self.tr(""), 869 icon=QMessageBox.Question, 870 text=self.tr("The '{basename}' may be a binary file.\n" 871 "Are you sure you want to continue?").format( 872 basename=os.path.basename(path)), 873 standardButtons=QMessageBox.Cancel | QMessageBox.Yes 874 ) 875 mb.setWindowModality(Qt.WindowModal) 876 return mb 877 878 def _path_must_be_relative_mb(self, prefix: str) -> QMessageBox: 879 mb = QMessageBox( 880 parent=self, windowTitle=self.tr("Invalid path"), 881 icon=QMessageBox.Warning, 882 text=self.tr("Selected path is not within '{prefix}'").format( 883 prefix=prefix 884 ), 885 ) 886 mb.setAttribute(Qt.WA_DeleteOnClose) 887 return mb 888 889 @Slot(str) 890 def browse_relative(self, prefixname): 891 path = self._replacements().get(prefixname) 892 self.browse(prefixname=prefixname, directory=path) 893 894 @Slot() 895 def browse(self, prefixname=None, directory=None): 896 """ 897 Open a file dialog and select a user specified file. 898 """ 899 dlg = self._browse_dialog() 900 if directory is not None: 901 dlg.setDirectory(directory) 902 903 status = dlg.exec() 904 dlg.deleteLater() 905 if status == QFileDialog.Accepted: 906 selected_filter = dlg.selectedFileFormat() 907 path = dlg.selectedFiles()[0] 908 if prefixname: 909 _prefixpath = self._replacements().get(prefixname, "") 910 if not isprefixed(_prefixpath, path): 911 mb = self._path_must_be_relative_mb(_prefixpath) 912 mb.show() 913 return 914 varpath = VarPath(prefixname, os.path.relpath(path, _prefixpath)) 915 else: 916 varpath = PathItem.AbsPath(path) 917 918 # pre-flight check; try to determine the nature of the file 919 mtype = _mime_type_for_path(path) 920 if not mtype.inherits("text/plain"): 921 mb = self._might_be_binary_mb(path) 922 if mb.exec() == QMessageBox.Cancel: 923 return 924 # initialize options based on selected format 925 options = default_options_for_mime_type( 926 path, selected_filter.mime_type, 927 ) 928 # Search for path in history. 929 # If found use the stored params to initialize the import dialog 930 items = self.itemsFromSettings() 931 idx = index_where(items, lambda t: samepath(t[0], path)) 932 if idx is not None: 933 _, options_ = items[idx] 934 if options_ is not None: 935 options = options_ 936 dlg = CSVImportDialog( 937 self, windowTitle="Import Options", sizeGripEnabled=True) 938 dlg.setWindowModality(Qt.WindowModal) 939 dlg.setPath(path) 940 dlg.setOptions(options) 941 status = dlg.exec() 942 dlg.deleteLater() 943 if status == QDialog.Accepted: 944 self.set_selected_file(path, dlg.options()) 945 self.current_item().setVarPath(varpath) 946 947 def current_item(self): 948 # type: () -> Optional[ImportItem] 949 """ 950 Return the current selected item (file) or None if there is no 951 current item. 952 """ 953 idx = self.recent_combo.currentIndex() 954 if idx == -1: 955 return None 956 957 item = self.recent_combo.model().item(idx) # type: QStandardItem 958 if isinstance(item, ImportItem): 959 return item 960 else: 961 return None 962 963 def _activate_import_dialog(self): 964 """Activate the Import Options dialog for the current item.""" 965 item = self.current_item() 966 assert item is not None 967 dlg = CSVImportDialog( 968 self, windowTitle="Import Options", sizeGripEnabled=True, 969 ) 970 dlg.setWindowModality(Qt.WindowModal) 971 dlg.setAttribute(Qt.WA_DeleteOnClose) 972 settings = self._local_settings() 973 settings.beginGroup(qname(type(dlg))) 974 size = settings.value("size", QSize(), type=QSize) # type: QSize 975 if size.isValid(): 976 dlg.resize(size) 977 978 path = item.data(ImportItem.PathRole) 979 options = item.data(ImportItem.OptionsRole) 980 dlg.setPath(path) # Set path before options so column types can 981 if isinstance(options, Options): 982 dlg.setOptions(options) 983 984 def update(): 985 newoptions = dlg.options() 986 item.setData(newoptions, ImportItem.OptionsRole) 987 # update local recent paths list 988 self._note_recent(path, newoptions) 989 if newoptions != options: 990 self._invalidate() 991 dlg.accepted.connect(update) 992 993 def store_size(): 994 settings.setValue("size", dlg.size()) 995 dlg.finished.connect(store_size) 996 dlg.show() 997 998 def set_selected_file(self, filename, options=None): 999 """ 1000 Set the current selected filename path. 1001 """ 1002 self._add_recent(filename, options) 1003 self._invalidate() 1004 1005 #: Saved options for a filename 1006 SCHEMA = { 1007 "path": str, # Local filesystem path 1008 "options": str, # json encoded 'Options' 1009 } 1010 1011 @classmethod 1012 def _local_settings(cls): 1013 # type: () -> QSettings 1014 """Return a QSettings instance with local persistent settings.""" 1015 filename = "{}.ini".format(qname(cls)) 1016 fname = os.path.join(settings.widget_settings_dir(), filename) 1017 return QSettings(fname, QSettings.IniFormat) 1018 1019 def _add_recent(self, filename, options=None): 1020 # type: (str, Optional[Options]) -> None 1021 """ 1022 Add filename to the list of recent files. 1023 """ 1024 model = self.import_items_model 1025 index = index_where( 1026 (model.index(i, 0).data(ImportItem.PathRole) 1027 for i in range(model.rowCount())), 1028 lambda path: isinstance(path, str) and samepath(path, filename) 1029 ) 1030 if index is not None: 1031 item, *_ = model.takeRow(index) 1032 else: 1033 item = ImportItem.fromPath(filename) 1034 1035 # item.setData(VarPath(filename), ImportItem.VarPathRole) 1036 item.setData(True, ImportItem.IsSessionItemRole) 1037 model.insertRow(0, item) 1038 1039 if options is not None: 1040 item.setOptions(options) 1041 1042 self.recent_combo.setCurrentIndex(0) 1043 1044 if not os.path.exists(filename): 1045 return 1046 self._note_recent(filename, options) 1047 1048 def _note_recent(self, filename, options): 1049 # store item to local persistent settings 1050 s = self._local_settings() 1051 arr = QSettings_readArray(s, "recent", OWCSVFileImport.SCHEMA) 1052 item = {"path": filename} 1053 if options is not None: 1054 item["options"] = json.dumps(options.as_dict()) 1055 arr = [item for item in arr if not samepath(item.get("path"), filename)] 1056 arr.append(item) 1057 QSettings_writeArray(s, "recent", arr) 1058 1059 def _invalidate(self): 1060 # Invalidate the current output and schedule a new commit call. 1061 # (NOTE: The widget enters a blocking state) 1062 self.__committimer.start() 1063 if self.__watcher is not None: 1064 self.__cancel_task() 1065 self.setBlocking(True) 1066 1067 def commit(self): 1068 """ 1069 Commit the current state and submit the load task for execution. 1070 1071 Note 1072 ---- 1073 Any existing pending task is canceled. 1074 """ 1075 self.__committimer.stop() 1076 if self.__watcher is not None: 1077 self.__cancel_task() 1078 self.error() 1079 1080 item = self.current_item() 1081 if item is None: 1082 return 1083 path = item.path() 1084 opts = item.options() 1085 if not isinstance(opts, Options): 1086 return 1087 1088 task = state = TaskState() 1089 state.future = ... 1090 state.watcher = qconcurrent.FutureWatcher() 1091 state.progressChanged.connect( 1092 self.__set_read_progress, Qt.DirectConnection) 1093 1094 def progress_(i, j): 1095 task.emitProgressChangedOrCancel(i, j) 1096 1097 task.future = self.__executor.submit( 1098 clear_stack_on_cancel(load_csv), 1099 path, opts, progress_, self.compatibility_mode 1100 ) 1101 task.watcher.setFuture(task.future) 1102 w = task.watcher 1103 w.done.connect(self.__handle_result) 1104 w.progress = state 1105 self.__watcher = w 1106 self.__set_running_state() 1107 1108 @Slot('qint64', 'qint64') 1109 def __set_read_progress(self, read, count): 1110 if count > 0: 1111 self.progressBarSet(100 * read / count) 1112 1113 def __cancel_task(self): 1114 # Cancel and dispose of the current task 1115 assert self.__watcher is not None 1116 w = self.__watcher 1117 w.future().cancel() 1118 w.progress.cancel = True 1119 w.done.disconnect(self.__handle_result) 1120 w.progress.progressChanged.disconnect(self.__set_read_progress) 1121 self.__watcher = None 1122 1123 def cancel(self): 1124 """ 1125 Cancel current pending or executing task. 1126 """ 1127 if self.__watcher is not None: 1128 self.__cancel_task() 1129 self.__clear_running_state() 1130 self.setStatusMessage("Cancelled") 1131 self.summary_text.setText( 1132 "<div>Cancelled<br/><small>Press 'Reload' to try again</small></div>" 1133 ) 1134 1135 def __set_running_state(self): 1136 self.progressBarInit() 1137 self.setBlocking(True) 1138 self.setStatusMessage("Running") 1139 self.cancel_button.setEnabled(True) 1140 self.load_button.setText("Restart") 1141 path = self.current_item().path() 1142 self.Error.clear() 1143 self.summary_text.setText( 1144 "<div>Loading: <i>{}</i><br/>".format(prettyfypath(path)) 1145 ) 1146 1147 def __clear_running_state(self, ): 1148 self.progressBarFinished() 1149 self.setStatusMessage("") 1150 self.setBlocking(False) 1151 self.cancel_button.setEnabled(False) 1152 self.load_button.setText("Reload") 1153 1154 def __set_error_state(self, err): 1155 self.Error.clear() 1156 if isinstance(err, UnicodeDecodeError): 1157 self.Error.encoding_error(exc_info=err) 1158 else: 1159 self.Error.error(exc_info=err) 1160 1161 path = self.current_item().path() 1162 basename = os.path.basename(path) 1163 if isinstance(err, UnicodeDecodeError): 1164 text = ( 1165 "<div><i>{basename}</i> was not loaded due to a text encoding " 1166 "error. The file might be saved in an unknown or invalid " 1167 "encoding, or it might be a binary file.</div>" 1168 ).format( 1169 basename=escape(basename) 1170 ) 1171 else: 1172 text = ( 1173 "<div><i>{basename}</i> was not loaded due to an error:" 1174 "<p style='white-space: pre;'>{err}</p>" 1175 ).format( 1176 basename=escape(basename), 1177 err="".join(traceback.format_exception_only(type(err), err)) 1178 ) 1179 self.summary_text.setText(text) 1180 1181 def __clear_error_state(self): 1182 self.Error.error.clear() 1183 self.summary_text.setText("") 1184 1185 def onDeleteWidget(self): 1186 """Reimplemented.""" 1187 if self.__watcher is not None: 1188 self.__cancel_task() 1189 self.__executor.shutdown() 1190 super().onDeleteWidget() 1191 1192 @Slot(object) 1193 def __handle_result(self, f): 1194 # type: (qconcurrent.Future[pd.DataFrame]) -> None 1195 assert f.done() 1196 assert f is self.__watcher.future() 1197 self.__watcher = None 1198 self.__clear_running_state() 1199 1200 try: 1201 df = f.result() 1202 assert isinstance(df, pd.DataFrame) 1203 except pandas.errors.EmptyDataError: 1204 df = pd.DataFrame({}) 1205 except Exception as e: # pylint: disable=broad-except 1206 self.__set_error_state(e) 1207 df = None 1208 else: 1209 self.__clear_error_state() 1210 1211 if df is not None: 1212 table = pandas_to_table(df) 1213 filename = self.current_item().path() 1214 table.name = os.path.splitext(os.path.split(filename)[-1])[0] 1215 else: 1216 table = None 1217 self.Outputs.data_frame.send(df) 1218 self.Outputs.data.send(table) 1219 self._update_status_messages(table) 1220 1221 def _update_status_messages(self, data): 1222 if data is None: 1223 return 1224 1225 def pluralize(seq): 1226 return "s" if len(seq) != 1 else "" 1227 1228 summary = ("{n_instances} row{plural_1}, " 1229 "{n_features} feature{plural_2}, " 1230 "{n_meta} meta{plural_3}").format( 1231 n_instances=len(data), plural_1=pluralize(data), 1232 n_features=len(data.domain.attributes), 1233 plural_2=pluralize(data.domain.attributes), 1234 n_meta=len(data.domain.metas), 1235 plural_3=pluralize(data.domain.metas)) 1236 self.summary_text.setText(summary) 1237 1238 def itemsFromSettings(self): 1239 # type: () -> List[Tuple[str, Options]] 1240 """ 1241 Return items from local history. 1242 """ 1243 s = self._local_settings() 1244 items_ = QSettings_readArray(s, "recent", OWCSVFileImport.SCHEMA) 1245 items = [] # type: List[Tuple[str, Options]] 1246 for item in items_: 1247 path = item.get("path", "") 1248 if not path: 1249 continue 1250 opts_json = item.get("options", "") 1251 try: 1252 opts = Options.from_dict(json.loads(opts_json)) 1253 except (csv.Error, LookupError, TypeError, json.JSONDecodeError): 1254 _log.error("Could not reconstruct options for '%s'", path, 1255 exc_info=True) 1256 else: 1257 items.append((path, opts)) 1258 return items[::-1] 1259 1260 def _replacements(self) -> Mapping[str, str]: 1261 replacements = [] 1262 basedir = self.workflowEnv().get("basedir", None) 1263 if basedir is not None: 1264 replacements += [('basedir', basedir)] 1265 return dict(replacements) 1266 1267 def _saveState(self): 1268 session_items = [] 1269 model = self.import_items_model 1270 for item in map(model.item, range(model.rowCount())): 1271 if isinstance(item, ImportItem) and item.data(ImportItem.IsSessionItemRole): 1272 vp = item.data(VarPathItem.VarPathRole) 1273 session_items.append((vp.as_dict(), item.options().as_dict())) 1274 self._session_items_v2 = session_items 1275 1276 def _restoreState(self): 1277 # Restore the state. Merge session (workflow) items with the 1278 # local history. 1279 model = self.import_items_model 1280 model.setReplacementEnv(self._replacements()) 1281 1282 # local history 1283 items = self.itemsFromSettings() 1284 # stored session items 1285 sitems = [] 1286 # replacements = self._replacements() 1287 for p, m in self._session_items_v2: 1288 try: 1289 p, m = (PathItem.from_dict(p), Options.from_dict(m)) 1290 except (csv.Error, LookupError, ValueError): 1291 _log.error("Failed to restore '%s'", p, exc_info=True) 1292 else: 1293 sitems.append((p, m, True)) 1294 1295 items = sitems + [(PathItem.AbsPath(p), m, False) for p, m in items] 1296 items = unique_everseen(items, key=lambda t: t[0]) 1297 curr = self.recent_combo.currentIndex() 1298 if curr != -1: 1299 currentpath = self.recent_combo.currentData(ImportItem.PathRole) 1300 else: 1301 currentpath = None 1302 1303 for path, options, is_session in items: 1304 item = ImportItem.fromPath(path) 1305 item.setOptions(options) 1306 item.setData(is_session, ImportItem.IsSessionItemRole) 1307 model.appendRow(item) 1308 1309 if currentpath: 1310 idx = self.recent_combo.findData(currentpath, ImportItem.PathRole) 1311 elif model.data(model.index(0, 0), ImportItem.IsSessionItemRole): 1312 # restore last (current) session item 1313 idx = 0 1314 else: 1315 idx = -1 1316 self.recent_combo.setCurrentIndex(idx) 1317 1318 @classmethod 1319 def migrate_settings(cls, settings, version): 1320 if not version or version < 2: 1321 settings["compatibility_mode"] = True 1322 1323 if version is not None and version < 3: 1324 items_ = settings.pop("_session_items", []) 1325 items_v2 = [(PathItem.AbsPath(p).as_dict(), m) for p, m in items_] 1326 settings["_session_items_v2"] = items_v2 1327 1328 1329@singledispatch 1330def sniff_csv(file, samplesize=2 ** 20, delimiters=None): 1331 sniffer = csv.Sniffer() 1332 sample = file.read(samplesize) 1333 dialect = sniffer.sniff(sample, delimiters=delimiters) 1334 dialect = textimport.Dialect( 1335 dialect.delimiter, dialect.quotechar, 1336 dialect.escapechar, dialect.doublequote, 1337 dialect.skipinitialspace, dialect.quoting 1338 ) 1339 has_header = HeaderSniffer(dialect).has_header(sample) 1340 return dialect, has_header 1341 1342 1343class HeaderSniffer(csv.Sniffer): 1344 def __init__(self, dialect: csv.Dialect): 1345 super().__init__() 1346 self.dialect = dialect 1347 1348 def sniff(self, *_args, **_kwargs): # pylint: disable=signature-differs 1349 # return fixed constant dialect, has_header sniffs dialect itself, 1350 # so it can't detect headers for a predefined dialect 1351 return self.dialect 1352 1353 1354@sniff_csv.register(str) 1355@sniff_csv.register(bytes) 1356def sniff_csv_with_path(path, encoding="utf-8", samplesize=2 ** 20, delimiters=None): 1357 with _open(path, "rt", encoding=encoding) as f: 1358 return sniff_csv(f, samplesize, delimiters) 1359 1360 1361def _open(path, mode, encoding=None): 1362 # type: (str, str, Optional[str]) -> typing.IO[Any] 1363 """ 1364 Open a local file `path` for reading. The file may be gzip, bz2 or zip 1365 compressed. 1366 1367 If a zip archive then a single archive member is expected. 1368 1369 Parameters 1370 ---------- 1371 path : str 1372 File system path 1373 mode : str 1374 'r', 'rb' or 'rt' 1375 encoding : Optional[str] 1376 Optional text encoding, for opening in text mode. 1377 1378 Returns 1379 ------- 1380 stream: io.BaseIO 1381 A stream opened for reading. 1382 """ 1383 if mode not in {'r', 'rb', 'rt'}: 1384 raise ValueError('r') 1385 _, ext = os.path.splitext(path) 1386 ext = ext.lower() 1387 if ext == ".gz": 1388 return gzip.open(path, mode, encoding=encoding) 1389 elif ext == ".bz2": 1390 return bz2.open(path, mode, encoding=encoding) 1391 elif ext == ".xz": 1392 return lzma.open(path, mode, encoding=encoding) 1393 elif ext == ".zip": 1394 arh = zipfile.ZipFile(path, 'r') 1395 filelist = arh.infolist() 1396 if len(filelist) == 1: 1397 f = arh.open(filelist[0], 'r') 1398 # patch the f.close to also close the main archive file 1399 f_close = f.close 1400 1401 def close_(): 1402 f_close() 1403 arh.close() 1404 f.close = close_ 1405 if 't' in mode: 1406 f = io.TextIOWrapper(f, encoding=encoding) 1407 return f 1408 else: 1409 raise ValueError("Expected a single file in the archive.") 1410 else: 1411 return open(path, mode, encoding=encoding) 1412 1413 1414compression_types = [ 1415 "application/gzip", "application/zip", 1416 "application/x-xz", "application/x-bzip", 1417 # application/x-lz4 1418] 1419 1420 1421def _mime_type_for_path(path): 1422 # type: (str) -> QMimeType 1423 """ 1424 Return the mime type of the file on a local filesystem. 1425 1426 In case the path is a compressed file return the mime type of its contents 1427 1428 Parameters 1429 ---------- 1430 path : str 1431 Local filesystem path 1432 1433 Returns 1434 ------- 1435 mimetype: QMimeType 1436 """ 1437 db = QMimeDatabase() 1438 mtype = db.mimeTypeForFile(path, QMimeDatabase.MatchDefault) 1439 if any(mtype.inherits(t) for t in compression_types): 1440 # peek contents 1441 try: 1442 with _open(path, "rb") as f: 1443 sample = f.read(4096) 1444 except Exception: # pylint: disable=broad-except 1445 sample = b'' 1446 mtype = db.mimeTypeForData(sample) 1447 return mtype 1448 1449 1450NA_DEFAULT = ["", "?", ".", "~", "nan", "NAN", "NaN", "N/A", "n/a", "NA"] 1451 1452NA_VALUES = { 1453 ColumnType.Numeric: NA_DEFAULT, 1454 ColumnType.Categorical: NA_DEFAULT, 1455 ColumnType.Time: NA_DEFAULT + ["NaT", "NAT"], 1456 ColumnType.Text: [], 1457 ColumnType.Auto: NA_DEFAULT, 1458} 1459 1460 1461def load_csv(path, opts, progress_callback=None, compatibility_mode=False): 1462 # type: (Union[AnyStr, BinaryIO], Options, Optional[Callable[[int, int], None]], bool) -> pd.DataFrame 1463 def dtype(coltype): 1464 # type: (ColumnType) -> Optional[str] 1465 if coltype == ColumnType.Numeric: 1466 return "float" 1467 elif coltype == ColumnType.Categorical: 1468 return "category" 1469 elif coltype == ColumnType.Time: 1470 return "object" 1471 elif coltype == ColumnType.Text: 1472 return "object" 1473 elif coltype == ColumnType.Skip: 1474 return None 1475 elif coltype == ColumnType.Auto: 1476 return None 1477 else: 1478 raise TypeError 1479 1480 def expand(ranges): 1481 # type: (Iterable[Tuple[range, T]]) -> Iterable[Tuple[int, T]] 1482 return ((i, x) for r, x in ranges for i in r) 1483 1484 dtypes = {i: dtype(c) for i, c in expand(opts.columntypes)} 1485 dtypes = {i: dtp for i, dtp in dtypes.items() 1486 if dtp is not None and dtp != ColumnType.Auto} 1487 1488 columns_ignored = {i for i, c in expand(opts.columntypes) 1489 if c == ColumnType.Skip} 1490 dtcols = {i for i, c in expand(opts.columntypes) 1491 if c == ColumnType.Time} 1492 parse_dates = sorted(dtcols) 1493 na_values = {i: NA_VALUES.get(c, NA_DEFAULT) 1494 for i, c in expand(opts.columntypes)} 1495 if not parse_dates: 1496 parse_dates = False 1497 1498 # fixup header indices to account for skipped rows (header row indices 1499 # pick rows after skiprows) 1500 1501 hspec = sorted(opts.rowspec, key=lambda t: t[0].start) 1502 header_ranges = [] 1503 nskiped = 0 1504 for range_, state in hspec: 1505 if state == RowSpec.Skipped: 1506 nskiped += len(range_) 1507 elif state == RowSpec.Header: 1508 header_ranges.append( 1509 range(range_.start - nskiped, range_.stop - nskiped) 1510 ) 1511 headers = [i for r in header_ranges for i in r] 1512 skiprows = [row for r, st in hspec if st == RowSpec.Skipped for row in r] 1513 1514 if not headers: 1515 header = None 1516 prefix = "X." 1517 1518 elif len(headers) == 1: 1519 header = headers[0] 1520 prefix = None 1521 else: 1522 header = headers 1523 prefix = None 1524 1525 if not skiprows: 1526 skiprows = None 1527 1528 numbers_format_kwds = {} 1529 1530 if opts.decimal_separator != ".": 1531 numbers_format_kwds["decimal"] = opts.decimal_separator 1532 1533 if opts.group_separator != "": 1534 numbers_format_kwds["thousands"] = opts.group_separator 1535 1536 if numbers_format_kwds: 1537 # float_precision = "round_trip" cannot handle non c-locale decimal and 1538 # thousands sep (https://github.com/pandas-dev/pandas/issues/35365). 1539 # Fallback to 'high'. 1540 numbers_format_kwds["float_precision"] = "high" 1541 else: 1542 numbers_format_kwds["float_precision"] = "round_trip" 1543 1544 with ExitStack() as stack: 1545 if isinstance(path, (str, bytes)): 1546 f = stack.enter_context(_open(path, 'rb')) 1547 elif isinstance(path, (io.RawIOBase, io.BufferedIOBase)) or \ 1548 hasattr(path, "read"): 1549 f = path 1550 else: 1551 raise TypeError() 1552 file = TextReadWrapper( 1553 f, encoding=opts.encoding, 1554 progress_callback=progress_callback) 1555 stack.callback(file.detach) 1556 df = pd.read_csv( 1557 file, sep=opts.dialect.delimiter, dialect=opts.dialect, 1558 skipinitialspace=opts.dialect.skipinitialspace, 1559 header=header, skiprows=skiprows, 1560 dtype=dtypes, parse_dates=parse_dates, prefix=prefix, 1561 na_values=na_values, keep_default_na=False, 1562 **numbers_format_kwds 1563 ) 1564 1565 # for older workflows avoid guessing type guessing 1566 if not compatibility_mode: 1567 df = guess_types(df, dtypes, columns_ignored) 1568 1569 if columns_ignored: 1570 # TODO: use 'usecols' parameter in `read_csv` call to 1571 # avoid loading/parsing the columns 1572 df.drop( 1573 columns=[df.columns[i] for i in columns_ignored 1574 if i < len(df.columns)], 1575 inplace=True 1576 ) 1577 return df 1578 1579 1580def guess_types( 1581 df: pd.DataFrame, dtypes: Dict[int, str], columns_ignored: Set[int] 1582) -> pd.DataFrame: 1583 """ 1584 Guess data type for variables according to values. 1585 1586 Parameters 1587 ---------- 1588 df 1589 Data frame 1590 dtypes 1591 The dictionary with data types set by user. We will guess values only 1592 for columns that does not have data type defined. 1593 columns_ignored 1594 List with indices of ignored columns. Ignored columns are skipped. 1595 1596 Returns 1597 ------- 1598 A data frame with changed dtypes according to the strategy. 1599 """ 1600 for i, col in enumerate(df): 1601 # only when automatic is set in widget dialog 1602 if dtypes.get(i, None) is None and i not in columns_ignored: 1603 df[col] = guess_data_type(df[col]) 1604 return df 1605 1606 1607def guess_data_type(col: pd.Series) -> pd.Series: 1608 """ 1609 Guess column types. Logic is same than in guess_data_type from io_utils 1610 module. This function only change the dtype of the column such that later 1611 correct Orange.data.variable is used. 1612 Logic: 1613 - if can converted to date-time (ISO) -> TimeVariable 1614 - if numeric (only numbers) 1615 - only values {0, 1} or {1, 2} -> DiscreteVariable 1616 - else -> ContinuousVariable 1617 - if not numbers: 1618 - num_unique_values < len(data) ** 0.7 and < 100 -> DiscreteVariable 1619 - else -> StringVariable 1620 1621 Parameters 1622 ---------- 1623 col 1624 Data column 1625 1626 Returns 1627 ------- 1628 Data column with correct dtype 1629 """ 1630 def parse_dates(s): 1631 """ 1632 This is an extremely fast approach to datetime parsing. 1633 For large data, the same dates are often repeated. Rather than 1634 re-parse these, we store all unique dates, parse them, and 1635 use a lookup to convert all dates. 1636 """ 1637 try: 1638 dates = {date: pd.to_datetime(date) for date in s.unique()} 1639 except ValueError: 1640 return None 1641 return s.map(dates) 1642 1643 if pdtypes.is_numeric_dtype(col): 1644 unique_values = col.unique() 1645 if len(unique_values) <= 2 and ( 1646 len(np.setdiff1d(unique_values, [0, 1])) == 0 1647 or len(np.setdiff1d(unique_values, [1, 2])) == 0): 1648 return col.astype("category") 1649 else: # object 1650 # try parse as date - if None not a date 1651 parsed_col = parse_dates(col) 1652 if parsed_col is not None: 1653 return parsed_col 1654 unique_values = col.unique() 1655 if len(unique_values) < 100 and len(unique_values) < len(col)**0.7: 1656 return col.astype("category") 1657 return col 1658 1659 1660def clear_stack_on_cancel(f): 1661 """ 1662 A decorator that catches the TaskState.UserCancelException exception 1663 and clears the exception's traceback to remove local references. 1664 1665 Parameters 1666 ---------- 1667 f : callable 1668 1669 Returns 1670 ------- 1671 wrapped : callable 1672 """ 1673 def wrapper(*args, **kwargs): 1674 try: 1675 return f(*args, **kwargs) 1676 except TaskState.UserCancelException as e: 1677 # TODO: Is this enough to allow immediate gc of the stack? 1678 # How does it chain across cython code? 1679 # Maybe just return None. 1680 e = e.with_traceback(None) 1681 e.__context__ = None 1682 e.__cause__ = None 1683 raise e 1684 except BaseException as e: 1685 traceback.clear_frames(e.__traceback__) 1686 raise 1687 1688 return wrapper 1689 1690 1691class TaskState(QObject, PyOwned): 1692 class UserCancelException(BaseException): 1693 """User interrupt exception.""" 1694 1695 #: Signal emitted with the current read progress. First value is the current 1696 #: progress state, second value is the total progress to complete 1697 #: (-1 if unknown) 1698 progressChanged = Signal('qint64', 'qint64') 1699 __progressChanged = Signal('qint64', 'qint64') 1700 #: Was cancel requested. 1701 cancel = False 1702 1703 def __init__(self, *args, **kwargs): 1704 super().__init__(*args, **kwargs) 1705 # route the signal via this object's queue 1706 self.__progressChanged.connect( 1707 self.progressChanged, Qt.QueuedConnection) 1708 1709 def emitProgressChangedOrCancel(self, current, total): 1710 # type: (int, int) -> None 1711 """ 1712 Emit the progressChanged signal with `current` and `total`. 1713 """ 1714 if self.cancel: 1715 raise TaskState.UserCancelException() 1716 else: 1717 self.__progressChanged.emit(current, total) 1718 1719 1720class TextReadWrapper(io.TextIOWrapper): 1721 """ 1722 TextIOWrapper reporting the read progress. 1723 1724 Assuming a single forward read pass. 1725 """ 1726 1727 #: A `Callable[[int, int], []]` called when the file position is 1728 #: advanced by read; called with current byte position and total 1729 #: file size. 1730 progress_callback = ... # type: Callable[[int, int], None] 1731 1732 def __init__(self, buffer, *args, progress_callback=None, **kwargs): 1733 super().__init__(buffer, *args, **kwargs) 1734 if progress_callback is None: 1735 def progress_callback(i, j): # pylint: disable=unused-argument 1736 pass 1737 self.progress_callback = progress_callback 1738 try: 1739 self.__size = os.fstat(buffer.fileno()).st_size 1740 except OSError: 1741 self.__size = -1 1742 1743 def read(self, size=-1): 1744 s = super().read(size) 1745 # try to go around any gzip/bz2/lzma wrappers to the base 1746 # raw file (does not work for zipfile.ZipExtFile; should 1747 # dispatch on buffer type) 1748 try: 1749 fd = self.buffer.fileno() 1750 except (AttributeError, io.UnsupportedOperation): 1751 pos = -1 1752 else: 1753 try: 1754 pos = os.lseek(fd, 0, os.SEEK_CUR) 1755 except OSError: 1756 pos = -1 1757 1758 self.progress_callback(pos, self.__size) 1759 return s 1760 1761 1762def index_where(iterable, pred): 1763 # type: (Iterable[T], Callable[[T], bool]) -> Optional[int] 1764 """ 1765 Return the (first) index of el in `iterable` where `pred(el)` returns True. 1766 1767 If no element matches return `None`. 1768 """ 1769 for i, el in enumerate(iterable): 1770 if pred(el): 1771 return i 1772 return None 1773 1774 1775def pandas_to_table(df): 1776 # type: (pd.DataFrame) -> Orange.data.Table 1777 """ 1778 Convert a pandas.DataFrame to a Orange.data.Table instance. 1779 """ 1780 index = df.index 1781 if not isinstance(index, pd.RangeIndex): 1782 df = df.reset_index() 1783 1784 columns = [] # type: List[Tuple[Orange.data.Variable, np.ndarray]] 1785 1786 for header, series in df.items(): # type: (Any, pd.Series) 1787 if pdtypes.is_categorical_dtype(series): 1788 coldata = series.values # type: pd.Categorical 1789 categories = natural_sorted(str(c) for c in coldata.categories) 1790 var = Orange.data.DiscreteVariable.make( 1791 str(header), values=categories 1792 ) 1793 # Remap the coldata into the var.values order/set 1794 coldata = pd.Categorical( 1795 coldata.astype("str"), categories=var.values 1796 ) 1797 codes = coldata.codes 1798 assert np.issubdtype(codes.dtype, np.integer) 1799 orangecol = np.array(codes, dtype=np.float) 1800 orangecol[codes < 0] = np.nan 1801 elif pdtypes.is_datetime64_any_dtype(series): 1802 # Check that this converts tz local to UTC 1803 series = series.astype(np.dtype("M8[ns]")) 1804 coldata = series.values # type: np.ndarray 1805 assert coldata.dtype == "M8[ns]" 1806 mask = np.isnat(coldata) 1807 orangecol = coldata.astype(np.int64) / 10 ** 9 1808 orangecol[mask] = np.nan 1809 var = Orange.data.TimeVariable.make(str(header)) 1810 var.have_date = var.have_time = 1 1811 elif pdtypes.is_object_dtype(series): 1812 coldata = series.fillna('').values 1813 assert isinstance(coldata, np.ndarray) 1814 orangecol = coldata 1815 var = Orange.data.StringVariable.make(str(header)) 1816 elif pdtypes.is_integer_dtype(series): 1817 coldata = series.values 1818 var = Orange.data.ContinuousVariable.make(str(header)) 1819 var.number_of_decimals = 0 1820 orangecol = coldata.astype(np.float64) 1821 elif pdtypes.is_numeric_dtype(series): 1822 orangecol = series.values.astype(np.float64) 1823 var = Orange.data.ContinuousVariable.make(str(header)) 1824 else: 1825 warnings.warn( 1826 "Column '{}' with dtype: {} skipped." 1827 .format(header, series.dtype), 1828 UserWarning 1829 ) 1830 continue 1831 columns.append((var, orangecol)) 1832 1833 cols_x = [(var, col) for var, col in columns if var.is_primitive()] 1834 cols_m = [(var, col) for var, col in columns if not var.is_primitive()] 1835 1836 variables = [v for v, _ in cols_x] 1837 if cols_x: 1838 X = np.column_stack([a for _, a in cols_x]) 1839 else: 1840 X = np.empty((df.shape[0], 0), dtype=np.float) 1841 metas = [v for v, _ in cols_m] 1842 if cols_m: 1843 M = np.column_stack([a for _, a in cols_m]) 1844 else: 1845 M = None 1846 1847 domain = Orange.data.Domain(variables, metas=metas) 1848 return Orange.data.Table.from_numpy(domain, X, None, M) 1849 1850 1851def main(argv=None): # pragma: no cover 1852 app = QApplication(argv or []) 1853 w = OWCSVFileImport() 1854 w.show() 1855 w.raise_() 1856 app.exec() 1857 w.saveSettings() 1858 w.onDeleteWidget() 1859 return 0 1860 1861 1862if __name__ == "__main__": # pragma: no cover 1863 sys.exit(main(sys.argv)) 1864