import os
from collections import Counter, defaultdict
from copy import copy
from numbers import Integral
from itertools import chain
from typing import Union, Optional, List, Tuple

import nltk
import numpy as np
import scipy.sparse as sp
from gensim import corpora

from Orange.data import (
    Variable,
    ContinuousVariable,
    DiscreteVariable,
    Domain,
    RowInstance,
    Table,
    StringVariable,
)
from Orange.preprocess.transformation import Identity
# uncomment when Orange3==3.27 is available
# from Orange.data.util import get_unique_names
# remove when Orange3==3.27 is available
from orangecontrib.text.vectorization.base import get_unique_names
from orangecontrib.text.vectorization import BowVectorizer


def get_sample_corpora_dir():
    """Return the absolute path of the ``datasets`` directory beside this module."""
    path = os.path.dirname(__file__)
    directory = os.path.join(path, 'datasets')
    return os.path.abspath(directory)


def _check_arrays(*arrays):
    """
    Validate the given arrays and return their common leading dimension.

    Args:
        *arrays: each item must be None, a numpy.ndarray or a scipy sparse
            matrix.

    Returns:
        int: the shared number of rows, or 0 when every array is None.

    Raises:
        TypeError: if an item is of any other type.
        ValueError: if the non-None arrays disagree on the number of rows.
    """
    for a in arrays:
        if not (a is None or isinstance(a, np.ndarray) or sp.issparse(a)):
            raise TypeError('Argument {} should be of type np.array, sparse or None.'.format(a))

    lengths = set(a.shape[0] for a in arrays if a is not None)
    if len(lengths) > 1:
        raise ValueError('Leading dimension mismatch')

    return lengths.pop() if lengths else 0


class Corpus(Table):
    """Internal class for storing a corpus."""

    def __new__(cls, *args, **kwargs):
        """Bypass Table.__new__."""
        return object.__new__(cls)

    def __init__(self, domain=None, X=None, Y=None, metas=None, W=None,
                 text_features=None, ids=None):
        """
        Args:
            domain (Orange.data.Domain): the domain for this Corpus
            X (numpy.ndarray): attributes
            Y (numpy.ndarray): class variables
            metas (numpy.ndarray): meta attributes; e.g. text
            W (numpy.ndarray): instance weights
            text_features (list): meta attributes that are used for
                text mining. Infer them if None.
            ids (numpy.ndarray): Indices
        """
        n_doc = _check_arrays(X, Y, metas)

        # missing parts are replaced by empty (n_doc x 0) arrays
        self.X = X if X is not None else np.zeros((n_doc, 0))
        self.Y = Y if Y is not None else np.zeros((n_doc, 0))
        self.metas = metas if metas is not None else np.zeros((n_doc, 0))
        self.W = W if W is not None else np.zeros((n_doc, 0))
        self.domain = domain
        self.text_features = []  # list of text features for mining
        self._tokens = None
        self._dictionary = None
        self._ngrams_corpus = None
        self.ngram_range = (1, 1)
        self.attributes = {}
        self.pos_tags = None
        # imported locally - presumably to avoid a circular import; confirm
        from orangecontrib.text.preprocess import PreprocessorList
        self.__used_preprocessor = PreprocessorList([])  # required for compute values
        self._titles: Optional[np.ndarray] = None
        self._pp_documents = None  # preprocessed documents

        if domain is not None and text_features is None:
            self._infer_text_features()
        elif domain is not None:
            self.set_text_features(text_features)

        if ids is not None:
            self.ids = ids
        else:
            Table._init_ids(self)
        self._set_unique_titles()

    @property
    def used_preprocessor(self):
        """The PreprocessorList recorded on this corpus."""
        return self.__used_preprocessor  # type: PreprocessorList

    @used_preprocessor.setter
    def used_preprocessor(self, pp):
        """
        Record preprocessing.

        A PreprocessorList replaces the stored list (its preprocessors are
        copied into a new PreprocessorList); a single Preprocessor is
        appended to the stored list.

        Raises:
            NotImplementedError: for any other type.
        """
        from orangecontrib.text.preprocess import PreprocessorList, Preprocessor

        if isinstance(pp, PreprocessorList):
            self.__used_preprocessor = PreprocessorList(list(pp.preprocessors))
        elif isinstance(pp, Preprocessor):
            self.__used_preprocessor.preprocessors.append(pp)
        else:
            raise NotImplementedError

    def _find_identical_feature(self, feature: Variable) -> Optional[Variable]:
        """
        Find a renamed feature in the domain which is identical to a feature.

        Parameters
        ----------
        feature
            A variable to find an identical variable in the domain.

        Returns
        -------
        Variable which is identical to a feature (it is either equal to it or
        has a different name but Identity(feature) as its compute value).
        None when there is no such variable.
        """
        for var in chain(self.domain.variables, self.domain.metas):
            if var == feature or (
                isinstance(var.compute_value, Identity)
                and var.compute_value.variable == feature
            ):
                return var
        return None

    def set_text_features(self, feats: Optional[List[Variable]]) -> None:
        """
        Select which meta-attributes to include when mining text.

        Parameters
        ----------
        feats
            List of text features to include. If None infer them.

        Raises
        ------
        ValueError
            When a feature is not in the domain (neither directly nor as a
            renamed identity feature) or when features are not unique.
        """
        if feats is not None:
            feats = copy(feats)  # copy to not edit passed array inplace
            # the domain does not change in the loop, so collect it once
            domain_vars = tuple(chain(self.domain.variables, self.domain.metas))
            for i, f in enumerate(feats):
                if f not in domain_vars:
                    # if not exact feature in the domain, it may be renamed
                    # find identity - renamed feature
                    id_feat = self._find_identical_feature(f)
                    if id_feat is not None:
                        feats[i] = id_feat
                    else:
                        raise ValueError('Feature "{}" not found.'.format(f))
            if len(set(feats)) != len(feats):
                raise ValueError('Text features must be unique.')
            self.text_features = feats
        else:
            self._infer_text_features()
        self._tokens = None  # invalidate tokens

    def set_title_variable(
            self, title_variable: Union[StringVariable, str, None]
    ) -> None:
        """
        Set the title attribute. Only one column can be a title attribute.

        Parameters
        ----------
        title_variable
            Variable that needs to be set as a title variable. If it is None,
            do not set a variable.
        """
        # clear the title mark from every variable first
        for a in self.domain.variables + self.domain.metas:
            a.attributes.pop("title", None)

        if title_variable and title_variable in self.domain:
            self.domain[title_variable].attributes["title"] = True

        self._set_unique_titles()

    def _set_unique_titles(self):
        """
        Define self._titles as an array of titles (a title for each
        document). It is used to have a unique title for each document. In
        case a document has the same title as another document we put a
        number beside it. Without a title variable documents are named
        "Document <n>". Does nothing when the corpus has no domain.
        """
        if self.domain is None:
            return
        attrs = [attr for attr in
                 chain(self.domain.variables, self.domain.metas)
                 if attr.attributes.get('title', False)]

        if attrs:
            self._titles = np.array(self._unique_titles(
                self.documents_from_features(attrs)))
        else:
            self._titles = np.array([
                'Document {}'.format(i + 1) for i in range(len(self))])

    @staticmethod
    def _unique_titles(titles: List[str]) -> List[str]:
        """
        Function adds numbers to the non-unique values of the title.

        Parameters
        ----------
        titles
            List of titles - not necessarily unique

        Returns
        -------
        List with unique titles.
        """
        counts = Counter(titles)
        cur_appearances = defaultdict(int)
        new_titles = []
        for t in titles:
            if counts[t] > 1:
                # number repeated titles in order of appearance
                cur_appearances[t] += 1
                t += f" ({cur_appearances[t]})"
            new_titles.append(t)
        return new_titles

    def _infer_text_features(self):
        """
        Infer which text features to use. If nothing was provided
        in the file header, use the first text feature.
        """
        include_feats = []
        first = None
        for attr in self.domain.metas:
            if attr.is_string:
                if first is None:
                    first = attr
                # the 'include' flag is compared as the string 'True'
                if attr.attributes.get('include', 'False') == 'True':
                    include_feats.append(attr)
        if len(include_feats) == 0 and first:
            include_feats.append(first)
        self.set_text_features(include_feats)

    def extend_corpus(self, metadata, Y):
        """
        Append documents to corpus.

        Args:
            metadata (numpy.ndarray): Meta data
            Y (numpy.ndarray): Class variables

        Raises:
            ValueError: when X is not empty.
        """
        if np.prod(self.X.shape) != 0:
            raise ValueError("Extending corpus only works when X is empty "
                             "while the shape of X is {}".format(self.X.shape))

        self.metas = np.vstack((self.metas, metadata))

        cv = self.domain.class_var
        # register class values that the class variable does not know yet
        for val in set(filter(None, Y)):
            if val not in cv.values:
                cv.add_value(val)
        new_Y = np.array([cv.to_val(i) for i in Y])[:, None]
        self._Y = np.vstack((self._Y, new_Y))

        self.X = self.W = np.zeros((self.metas.shape[0], 0))
        Table._init_ids(self)

        self._tokens = None  # invalidate tokens
        self._set_unique_titles()

    def extend_attributes(
            self, X, feature_names, feature_values=None, compute_values=None,
            var_attrs=None, sparse=False, rename_existing=False
    ):
        """
        Append features to corpus. If `feature_values` argument is present,
        features will be Discrete else Continuous.

        Args:
            X (numpy.ndarray or scipy.sparse.csr_matrix): Features values to append
            feature_names (list): List of string containing feature names
            feature_values (list): A list of possible values for Discrete features.
            compute_values (list): Compute values for corresponding features.
            var_attrs (dict): Additional attributes appended to variable.attributes.
            sparse (bool): Whether the features should be marked as sparse.
            rename_existing (bool): When true and names are not unique rename
                existing features; if false rename new features

        Returns:
            Corpus: a new corpus with the additional attributes; this corpus
            is not modified.
        """
        def _rename_features(
                additional_names: List
        ) -> Tuple[List, Optional[Variable], List]:
            """Return current attributes, class var and metas, renamed if required."""
            cur_attr = list(self.domain.attributes)
            cur_class = self.domain.class_var
            cur_meta = list(self.domain.metas)
            if rename_existing:
                current_vars = (
                    cur_attr + (
                        [cur_class] if cur_class else []) + cur_meta
                )
                current_names = [a.name for a in current_vars]
                new_names = get_unique_names(
                    additional_names, current_names, equal_numbers=False
                )
                renamed_vars = [
                    var.renamed(n) for var, n in zip(current_vars, new_names)
                ]
                n_attr = len(cur_attr)
                n_meta = len(cur_meta)
                cur_attr = renamed_vars[:n_attr]
                cur_class = renamed_vars[n_attr] if cur_class else None
                # explicit start index: renamed_vars[-0:] would return the
                # whole list when there are no metas
                cur_meta = renamed_vars[len(renamed_vars) - n_meta:]
            return cur_attr, cur_class, cur_meta

        if sp.issparse(self.X) or sp.issparse(X):
            X = sp.hstack((self.X, X)).tocsr()
        else:
            X = np.hstack((self.X, X))

        # X is already the stacked matrix here, so these lists may be longer
        # than feature_names; zip below stops at the shortest sequence
        if compute_values is None:
            compute_values = [None] * X.shape[1]
        if feature_values is None:
            feature_values = [None] * X.shape[1]

        # rename existing variables if required
        curr_attributes, curr_class_var, curr_metas = _rename_features(
            feature_names
        )
        if not rename_existing:
            # rename new feature names if required
            feature_names = get_unique_names(
                self.domain, feature_names, equal_numbers=False
            )

        additional_attributes = []
        for f, values, cv in zip(feature_names, feature_values, compute_values):
            if values is not None:
                var = DiscreteVariable(f, values=values, compute_value=cv)
            else:
                var = ContinuousVariable(f, compute_value=cv)
            var.sparse = sparse  # don't pass this to constructor so this works with Orange < 3.8.0
            if cv is not None:  # set original variable for cv
                cv.variable = var
            if isinstance(var_attrs, dict):
                var.attributes.update(var_attrs)
            additional_attributes.append(var)

        new_domain = Domain(
            attributes=curr_attributes + additional_attributes,
            class_vars=curr_class_var,
            metas=curr_metas
        )
        c = Corpus(
            new_domain,
            X,
            self.Y.copy(),
            self.metas.copy(),
            self.W.copy(),
            copy(self.text_features)
        )
        Corpus.retain_preprocessing(self, c)
        return c

    @property
    def documents(self):
        """ Returns a list of strings representing documents — created
        by joining selected text features. """
        return self.documents_from_features(self.text_features)

    @property
    def pp_documents(self):
        """ Preprocessed documents (transformed). Falls back to `documents`
        when no preprocessed documents are stored. """
        return self._pp_documents or self.documents

    @pp_documents.setter
    def pp_documents(self, documents):
        self._pp_documents = documents

    @property
    def titles(self):
        """ Returns the array of titles. """
        assert self._titles is not None
        return self._titles

    def documents_from_features(self, feats):
        """
        Args:
            feats (list): A list of features to join.

        Returns: a list of strings constructed by joining feats.
        """
        # create a Table where feats are in metas
        data = Table.from_table(Domain([], [], [i.name for i in feats],
                                       source=self.domain), self)

        # When we use only features coming from sparse X data.metas is sparse.
        # Transform it to dense.
        if sp.issparse(data.metas):
            data.metas = data.metas.toarray()

        return [' '.join(f.str_val(val) for f, val in zip(data.domain.metas, row))
                for row in data.metas]

    def store_tokens(self, tokens, dictionary=None):
        """
        Args:
            tokens (list): List of lists containing tokens.
            dictionary: Token dictionary to store; when it is missing (or
                falsy) a corpora.Dictionary is built from the tokens.
        """
        self._tokens = np.array(tokens, dtype=object)
        self._dictionary = dictionary or corpora.Dictionary(self.tokens)

    @property
    def tokens(self):
        """
        np.ndarray: A list of lists containing tokens. If tokens are not yet
        present, run default preprocessor and return tokens.
        """
        if self._tokens is None:
            return self._base_tokens()[0]
        return self._tokens

    def has_tokens(self):
        """ Return whether corpus is preprocessed or not. """
        return self._tokens is not None

    def _base_tokens(self):
        """Run the base transformer and tokenizer; return (tokens, dictionary)."""
        from orangecontrib.text.preprocess import BASE_TRANSFORMER, \
            BASE_TOKENIZER, PreprocessorList

        # don't use anything that requires NLTK data to assure async download
        base_preprocessors = PreprocessorList([BASE_TRANSFORMER,
                                               BASE_TOKENIZER])
        corpus = base_preprocessors(self)
        return corpus.tokens, corpus.dictionary

    @property
    def dictionary(self):
        """
        corpora.Dictionary: A token to id mapper.
        """
        if self._dictionary is None:
            return self._base_tokens()[1]
        return self._dictionary

    def ngrams_iterator(self, join_with=' ', include_postags=False):
        """
        Return a generator with one list of ngrams per document.

        Ngram sizes go from ngram_range[0] to ngram_range[1] inclusive.

        Args:
            join_with (str or None): String used to join the tokens of an
                ngram. When None, ngrams are yielded as produced by
                nltk.ngrams (not joined).
            include_postags (bool): Append POS tags to tokens as
                "token_tag". Ignored when the corpus has no POS tags.
        """
        if self.pos_tags is None:
            include_postags = False

        if include_postags:
            data = zip(self.tokens, self.pos_tags)
        else:
            data = self.tokens

        if join_with is None:
            # NOTE(review): with include_postags each doc is a
            # (tokens, tags) pair here and is passed on unchanged - confirm
            def processor(doc, n):
                return nltk.ngrams(doc, n)
        elif include_postags:
            def processor(doc, n):
                # zip(*doc) pairs each token with its tag
                return (join_with.join(token + '_' + tag for token, tag in ngram)
                        for ngram in nltk.ngrams(zip(*doc), n))
        else:
            def processor(doc, n):
                return (join_with.join(ngram) for ngram in nltk.ngrams(doc, n))

        return (list(chain(*(processor(doc, n)
                             for n in range(self.ngram_range[0], self.ngram_range[1]+1))))
                for doc in data)

    @property
    def ngrams_corpus(self):
        """Stored ngrams corpus; when none is stored it is taken from the
        result of BowVectorizer().transform(self)."""
        if self._ngrams_corpus is None:
            return BowVectorizer().transform(self).ngrams_corpus
        return self._ngrams_corpus

    @ngrams_corpus.setter
    def ngrams_corpus(self, value):
        self._ngrams_corpus = value

    @property
    def ngrams(self):
        """generator: Ngram representations of documents."""
        return self.ngrams_iterator(join_with=' ')

    def copy(self):
        """Return a copy of the table."""
        c = self.__class__(self.domain, self.X.copy(), self.Y.copy(), self.metas.copy(),
                           self.W.copy(), copy(self.text_features))
        # since tokens and dictionary are considered immutable copies are not needed
        c._tokens = self._tokens
        c._dictionary = self._dictionary
        c.ngram_range = self.ngram_range
        c.pos_tags = self.pos_tags
        c.name = self.name
        c.used_preprocessor = self.used_preprocessor
        c._titles = self._titles
        c._pp_documents = self._pp_documents
        return c

    @staticmethod
    def from_documents(documents, name, attributes=None, class_vars=None, metas=None,
                       title_indices=None):
        """
        Create corpus from documents.

        Args:
            documents (list): List of documents.
            name (str): Name of the corpus
            attributes (list): List of tuples (Variable, getter) for attributes.
            class_vars (list): List of tuples (Variable, getter) for class vars.
            metas (list): List of tuples (Variable, getter) for metas.
            title_indices (list): List of indices into domain corresponding to features which will
                be used as titles.

        Returns:
            Corpus.
        """
        attributes = attributes or []
        class_vars = class_vars or []
        metas = metas or []
        title_indices = title_indices or []

        domain = Domain(attributes=[attr for attr, _ in attributes],
                        class_vars=[attr for attr, _ in class_vars],
                        metas=[attr for attr, _ in metas])

        for ind in title_indices:
            domain[ind].attributes['title'] = True

        def to_val(attr, val):
            # discrete variables get the value registered before conversion
            if isinstance(attr, DiscreteVariable):
                attr.val_from_str_add(val)
            return attr.to_val(val)

        if documents:
            X = np.array([[to_val(attr, func(doc)) for attr, func in attributes]
                          for doc in documents], dtype=np.float64)
            Y = np.array([[to_val(attr, func(doc)) for attr, func in class_vars]
                          for doc in documents], dtype=np.float64)
            metas = np.array([[to_val(attr, func(doc)) for attr, func in metas]
                              for doc in documents], dtype=object)
        else:   # assure shapes match the number of columns
            X = np.empty((0, len(attributes)))
            Y = np.empty((0, len(class_vars)))
            metas = np.empty((0, len(metas)))

        corpus = Corpus(X=X, Y=Y, metas=metas, domain=domain, text_features=[])
        corpus.name = name
        return corpus

    def __getitem__(self, key):
        """Index like Table; Corpus and RowInstance results keep preprocessing."""
        c = super().__getitem__(key)
        if isinstance(c, (Corpus, RowInstance)):
            Corpus.retain_preprocessing(self, c, key)
        return c

    @classmethod
    def from_table(cls, domain, source, row_indices=...):
        """Build a Corpus from `source` in `domain`, keeping its preprocessing."""
        t = super().from_table(domain, source, row_indices)
        c = Corpus(t.domain, t.X, t.Y, t.metas, t.W, ids=t.ids)
        Corpus.retain_preprocessing(source, c, row_indices)
        return c

    @classmethod
    def from_numpy(cls, *args, **kwargs):
        """Same as Table.from_numpy, followed by setting the titles."""
        c = super().from_numpy(*args, **kwargs)
        c._set_unique_titles()
        return c

    @classmethod
    def from_list(cls, domain, rows, weights=None):
        """Same as Table.from_list, followed by setting the titles."""
        c = super().from_list(domain, rows, weights)
        c._set_unique_titles()
        return c

    @classmethod
    def from_table_rows(cls, source, row_indices):
        """Same as Table.from_table_rows; titles of the selected rows are kept."""
        c = super().from_table_rows(source, row_indices)
        if hasattr(source, "_titles"):
            # covering case when from_table_rows called by from_table
            c._titles = source._titles[row_indices]
        return c

    @classmethod
    def from_file(cls, filename):
        """
        Load a corpus from a file.

        When `filename` does not exist, it is looked up in the sample corpora
        directory, with the '.tab' extension added if it is missing.

        Raises:
            FileNotFoundError: when the file is in neither location.
        """
        if not os.path.exists(filename):  # check the default location
            abs_path = os.path.join(get_sample_corpora_dir(), filename)
            if not abs_path.endswith('.tab'):
                abs_path += '.tab'
            if not os.path.exists(abs_path):
                raise FileNotFoundError('File "{}" not found.'.format(filename))
            else:
                filename = abs_path

        table = Table.from_file(filename)
        corpus = cls(table.domain, table.X, table.Y, table.metas, table.W)
        return corpus

    @staticmethod
    def retain_preprocessing(orig, new, key=...):
        """
        Set preprocessing of 'new' object to match the 'orig' object.

        Does nothing when `orig` is not a Corpus.

        Args:
            orig: the object that `new` was created from.
            new: the object that receives tokens, POS tags, dictionary, text
                features, titles, ngram range, attributes and the used
                preprocessor.
            key: the row selection that was used to create `new` from
                `orig`; for a tuple only the first item is used.

        Raises:
            TypeError: when `orig` has tokens and `key` is of an unsupported
                type.
        """
        if isinstance(orig, Corpus):
            if isinstance(key, tuple):  # get row selection
                key = key[0]

            if orig._tokens is not None:  # retain preprocessing
                if isinstance(key, Integral):
                    new._tokens = np.array([orig._tokens[key]])
                    new.pos_tags = None if orig.pos_tags is None else np.array(
                        [orig.pos_tags[key]])
                elif isinstance(key, (list, np.ndarray, slice, range)):
                    new._tokens = orig._tokens[key]
                    new.pos_tags = None if orig.pos_tags is None else orig.pos_tags[key]
                elif key is Ellipsis:
                    new._tokens = orig._tokens
                    new.pos_tags = orig.pos_tags
                else:
                    raise TypeError('Indexing by type {} not supported.'.format(type(key)))
                new._dictionary = orig._dictionary

            if isinstance(new, Corpus):
                # _find_identical_feature returns None when feature not found
                # filter these Nones from list
                new.text_features = list(filter(None, [
                    new._find_identical_feature(tf)
                    for tf in orig.text_features
                ]))
            else:
                # build the lookup set once instead of once per feature
                new_metas = set(new.domain.metas)
                new.text_features = [
                    tf
                    for tf in orig.text_features
                    if tf in new_metas
                ]

            new._titles = orig._titles[key]
            new.ngram_range = orig.ngram_range
            new.attributes = orig.attributes
            new.used_preprocessor = orig.used_preprocessor

    def __eq__(self, other):
        """
        Compare text features, dictionary, tokens, X, Y, metas, POS tags,
        domain and ngram range. Returns NotImplemented for non-Corpus objects.
        """
        if not isinstance(other, Corpus):
            return NotImplemented

        def arrays_equal(a, b):
            if sp.issparse(a) != sp.issparse(b):
                return False
            elif sp.issparse(a) and sp.issparse(b):
                return (a != b).nnz == 0
            else:
                return np.array_equal(a, b)

        return (self.text_features == other.text_features and
                self._dictionary == other._dictionary and
                np.array_equal(self._tokens, other._tokens) and
                arrays_equal(self.X, other.X) and
                arrays_equal(self.Y, other.Y) and
                arrays_equal(self.metas, other.metas) and
                np.array_equal(self.pos_tags, other.pos_tags) and
                self.domain == other.domain and
                self.ngram_range == other.ngram_range)