1import os
2from collections import Counter, defaultdict
3from copy import copy
4from numbers import Integral
5from itertools import chain
6from typing import Union, Optional, List, Tuple
7
8import nltk
9import numpy as np
10import scipy.sparse as sp
11from gensim import corpora
12
13from Orange.data import (
14    Variable,
15    ContinuousVariable,
16    DiscreteVariable,
17    Domain,
18    RowInstance,
19    Table,
20    StringVariable,
21)
22from Orange.preprocess.transformation import Identity
23# uncomment when Orange3==3.27 is available
24# from Orange.data.util import get_unique_names
25# remove when Orange3==3.27 is available
26from orangecontrib.text.vectorization.base import get_unique_names
27from orangecontrib.text.vectorization import BowVectorizer
28
29
def get_sample_corpora_dir():
    """Return the absolute path of the ``datasets`` directory next to this module."""
    module_dir = os.path.dirname(__file__)
    return os.path.abspath(os.path.join(module_dir, 'datasets'))
34
35
def _check_arrays(*arrays):
    """
    Validate optional data arrays and return their common number of rows.

    Args:
        *arrays: each item must be None, a numpy.ndarray or a scipy sparse
            matrix.

    Returns:
        int: the leading dimension shared by all non-None arrays, or 0 when
            there is no non-None array.

    Raises:
        TypeError: an argument is neither None, an ndarray nor sparse.
        ValueError: the non-None arrays differ in their leading dimension.
    """
    # First pass: every argument has to be of an accepted type.
    for array in arrays:
        if array is None:
            continue
        if isinstance(array, np.ndarray) or sp.issparse(array):
            continue
        raise TypeError('Argument {} should be of type np.array, sparse or None.'.format(array))

    # Second pass: collect the distinct row counts of the given arrays.
    row_counts = {array.shape[0] for array in arrays if array is not None}
    if len(row_counts) > 1:
        raise ValueError('Leading dimension mismatch')

    if not row_counts:
        return 0
    return row_counts.pop()
46
47
48class Corpus(Table):
49    """Internal class for storing a corpus."""
50
51    def __new__(cls, *args, **kwargs):
52        """Bypass Table.__new__."""
53        return object.__new__(cls)
54
55    def __init__(self, domain=None, X=None, Y=None, metas=None, W=None,
56                 text_features=None, ids=None):
57        """
58        Args:
59            domain (Orange.data.Domain): the domain for this Corpus
60            X (numpy.ndarray): attributes
61            Y (numpy.ndarray): class variables
62            metas (numpy.ndarray): meta attributes; e.g. text
63            W (numpy.ndarray): instance weights
64            text_features (list): meta attributes that are used for
65                text mining. Infer them if None.
66            ids (numpy.ndarray): Indices
67        """
68        n_doc = _check_arrays(X, Y, metas)
69
70        self.X = X if X is not None else np.zeros((n_doc, 0))
71        self.Y = Y if Y is not None else np.zeros((n_doc, 0))
72        self.metas = metas if metas is not None else np.zeros((n_doc, 0))
73        self.W = W if W is not None else np.zeros((n_doc, 0))
74        self.domain = domain
75        self.text_features = []    # list of text features for mining
76        self._tokens = None
77        self._dictionary = None
78        self._ngrams_corpus = None
79        self.ngram_range = (1, 1)
80        self.attributes = {}
81        self.pos_tags = None
82        from orangecontrib.text.preprocess import PreprocessorList
83        self.__used_preprocessor = PreprocessorList([])   # required for compute values
84        self._titles: Optional[np.ndarray] = None
85        self._pp_documents = None  # preprocessed documents
86
87        if domain is not None and text_features is None:
88            self._infer_text_features()
89        elif domain is not None:
90            self.set_text_features(text_features)
91
92        if ids is not None:
93            self.ids = ids
94        else:
95            Table._init_ids(self)
96        self._set_unique_titles()
97
98    @property
99    def used_preprocessor(self):
100        return self.__used_preprocessor  # type: PreprocessorList
101
102    @used_preprocessor.setter
103    def used_preprocessor(self, pp):
104        from orangecontrib.text.preprocess import PreprocessorList, Preprocessor
105
106        if isinstance(pp, PreprocessorList):
107            self.__used_preprocessor = PreprocessorList(list(pp.preprocessors))
108        elif isinstance(pp, Preprocessor):
109            self.__used_preprocessor.preprocessors.append(pp)
110        else:
111            raise NotImplementedError
112
113    def _find_identical_feature(self, feature: Variable) -> Optional[Variable]:
114        """
115        Find a renamed feature in the domain which is identical to a feature.
116
117        Parameters
118        ----------
119        feature
120            A variable to find an identical variable in the domain.
121
122        Returns
123        -------
124        Variable which is identical to a feature (have different name but has
125        Identity(feature) in compute value.
126        """
127        for var in chain(self.domain.variables, self.domain.metas):
128            if (
129                var == feature
130                or isinstance(var.compute_value, Identity)
131                and var.compute_value.variable == feature
132            ):
133                return var
134        return None
135
136    def set_text_features(self, feats: Optional[List[Variable]]) -> None:
137        """
138        Select which meta-attributes to include when mining text.
139
140        Parameters
141        ----------
142        feats
143            List of text features to include. If None infer them.
144        """
145        if feats is not None:
146            feats = copy(feats)  # copy to not edit passed array inplace
147            for i, f in enumerate(feats):
148                if f not in chain(self.domain.variables, self.domain.metas):
149                    # if not exact feature in the domain, it may be renamed
150                    # find identity - renamed feature
151                    id_feat = self._find_identical_feature(f)
152                    if id_feat is not None:
153                        feats[i] = id_feat
154                    else:
155                        raise ValueError('Feature "{}" not found.'.format(f))
156            if len(set(feats)) != len(feats):
157                raise ValueError('Text features must be unique.')
158            self.text_features = feats
159        else:
160            self._infer_text_features()
161        self._tokens = None     # invalidate tokens
162
163    def set_title_variable(
164            self, title_variable: Union[StringVariable, str, None]
165    ) -> None:
166        """
167        Set the title attribute. Only one column can be a title attribute.
168
169        Parameters
170        ----------
171        title_variable
172            Variable that need to be set as a title variable. If it is None,
173            do not set a variable.
174        """
175        for a in self.domain.variables + self.domain.metas:
176            a.attributes.pop("title", None)
177
178        if title_variable and title_variable in self.domain:
179            self.domain[title_variable].attributes["title"] = True
180
181        self._set_unique_titles()
182
183    def _set_unique_titles(self):
184        """
185        Define self._titles variable as a list of titles (a title for each
186        document). It is used to have an unique title for each document. In
187        case when the document have the same title as the other document we
188        put a number beside.
189        """
190        if self.domain is None:
191            return
192        attrs = [attr for attr in
193                 chain(self.domain.variables, self.domain.metas)
194                 if attr.attributes.get('title', False)]
195
196        if attrs:
197            self._titles = np.array(self._unique_titles(
198                self.documents_from_features(attrs)))
199        else:
200            self._titles = np.array([
201                'Document {}'.format(i + 1) for i in range(len(self))])
202
203    @staticmethod
204    def _unique_titles(titles: List[str]) -> List[str]:
205        """
206        Function adds numbers to the non-unique values fo the title.
207
208        Parameters
209        ----------
210        titles
211            List of titles - not necessary unique
212
213        Returns
214        -------
215        List with unique titles.
216        """
217        counts = Counter(titles)
218        cur_appearances = defaultdict(int)
219        new_titles = []
220        for t in titles:
221            if counts[t] > 1:
222                cur_appearances[t] += 1
223                t += f" ({cur_appearances[t]})"
224            new_titles.append(t)
225        return new_titles
226
227    def _infer_text_features(self):
228        """
229        Infer which text features to use. If nothing was provided
230        in the file header, use the first text feature.
231        """
232        include_feats = []
233        first = None
234        for attr in self.domain.metas:
235            if attr.is_string:
236                if first is None:
237                    first = attr
238                if attr.attributes.get('include', 'False') == 'True':
239                    include_feats.append(attr)
240        if len(include_feats) == 0 and first:
241            include_feats.append(first)
242        self.set_text_features(include_feats)
243
244    def extend_corpus(self, metadata, Y):
245        """
246        Append documents to corpus.
247
248        Args:
249            metadata (numpy.ndarray): Meta data
250            Y (numpy.ndarray): Class variables
251        """
252        if np.prod(self.X.shape) != 0:
253            raise ValueError("Extending corpus only works when X is empty"
254                             "while the shape of X is {}".format(self.X.shape))
255
256        self.metas = np.vstack((self.metas, metadata))
257
258        cv = self.domain.class_var
259        for val in set(filter(None, Y)):
260            if val not in cv.values:
261                cv.add_value(val)
262        new_Y = np.array([cv.to_val(i) for i in Y])[:, None]
263        self._Y = np.vstack((self._Y, new_Y))
264
265        self.X = self.W = np.zeros((self.metas.shape[0], 0))
266        Table._init_ids(self)
267
268        self._tokens = None     # invalidate tokens
269        self._set_unique_titles()
270
271    def extend_attributes(
272            self, X, feature_names, feature_values=None, compute_values=None,
273            var_attrs=None, sparse=False, rename_existing=False
274        ):
275        """
276        Append features to corpus. If `feature_values` argument is present,
277        features will be Discrete else Continuous.
278
279        Args:
280            X (numpy.ndarray or scipy.sparse.csr_matrix): Features values to append
281            feature_names (list): List of string containing feature names
282            feature_values (list): A list of possible values for Discrete features.
283            compute_values (list): Compute values for corresponding features.
284            var_attrs (dict): Additional attributes appended to variable.attributes.
285            sparse (bool): Whether the features should be marked as sparse.
286            rename_existing (bool): When true and names are not unique rename
287                exiting features; if false rename new features
288        """
289        def _rename_features(additional_names: List) -> Tuple[List, List, List]:
290            cur_attr = list(self.domain.attributes)
291            cur_class = self.domain.class_var
292            cur_meta = list(self.domain.metas)
293            if rename_existing:
294                current_vars = (
295                        cur_attr + (
296                    [cur_class] if cur_class else []) + cur_meta
297                )
298                current_names = [a.name for a in current_vars]
299                new_names = get_unique_names(
300                    additional_names, current_names, equal_numbers=False
301                )
302                renamed_vars = [
303                    var.renamed(n) for var, n in zip(current_vars, new_names)
304                ]
305                cur_attr = renamed_vars[:len(cur_attr)]
306                cur_class = renamed_vars[len(cur_attr)] if cur_class else None
307                cur_meta = renamed_vars[-len(cur_meta):]
308            return cur_attr, cur_class, cur_meta
309
310        if sp.issparse(self.X) or sp.issparse(X):
311            X = sp.hstack((self.X, X)).tocsr()
312        else:
313            X = np.hstack((self.X, X))
314
315        if compute_values is None:
316            compute_values = [None] * X.shape[1]
317        if feature_values is None:
318            feature_values = [None] * X.shape[1]
319
320        # rename existing variables if required
321        curr_attributes, curr_class_var, curr_metas = _rename_features(
322            feature_names
323        )
324        if not rename_existing:
325            # rename new feature names if required
326            feature_names = get_unique_names(
327                self.domain, feature_names, equal_numbers=False
328            )
329
330        additional_attributes = []
331        for f, values, cv in zip(feature_names, feature_values, compute_values):
332            if values is not None:
333                var = DiscreteVariable(f, values=values, compute_value=cv)
334            else:
335                var = ContinuousVariable(f, compute_value=cv)
336            var.sparse = sparse     # don't pass this to constructor so this works with Orange < 3.8.0
337            if cv is not None:      # set original variable for cv
338                cv.variable = var
339            if isinstance(var_attrs, dict):
340                var.attributes.update(var_attrs)
341            additional_attributes.append(var)
342
343        new_domain = Domain(
344                attributes=curr_attributes + additional_attributes,
345                class_vars=curr_class_var,
346                metas=curr_metas
347        )
348        c = Corpus(
349            new_domain,
350            X,
351            self.Y.copy(),
352            self.metas.copy(),
353            self.W.copy(),
354            copy(self.text_features)
355        )
356        Corpus.retain_preprocessing(self, c)
357        return c
358
359    @property
360    def documents(self):
361        """ Returns a list of strings representing documents — created
362        by joining selected text features. """
363        return self.documents_from_features(self.text_features)
364
365    @property
366    def pp_documents(self):
367        """ Preprocessed documents (transformed). """
368        return self._pp_documents or self.documents
369
370    @pp_documents.setter
371    def pp_documents(self, documents):
372        self._pp_documents = documents
373
374    @property
375    def titles(self):
376        """ Returns a list of titles. """
377        assert self._titles is not None
378        return self._titles
379
380    def documents_from_features(self, feats):
381        """
382        Args:
383            feats (list): A list fo features to join.
384
385        Returns: a list of strings constructed by joining feats.
386        """
387        # create a Table where feats are in metas
388        data = Table.from_table(Domain([], [], [i.name for i in feats],
389                                       source=self.domain), self)
390
391        # When we use only features coming from sparse X data.metas is sparse.
392        # Transform it to dense.
393        if sp.issparse(data.metas):
394            data.metas = data.metas.toarray()
395
396        return [' '.join(f.str_val(val) for f, val in zip(data.domain.metas, row))
397                for row in data.metas]
398
399    def store_tokens(self, tokens, dictionary=None):
400        """
401        Args:
402            tokens (list): List of lists containing tokens.
403        """
404        self._tokens = np.array(tokens, dtype=object)
405        self._dictionary = dictionary or corpora.Dictionary(self.tokens)
406
407    @property
408    def tokens(self):
409        """
410        np.ndarray: A list of lists containing tokens. If tokens are not yet
411        present, run default preprocessor and return tokens.
412        """
413        if self._tokens is None:
414            return self._base_tokens()[0]
415        return self._tokens
416
417    def has_tokens(self):
418        """ Return whether corpus is preprocessed or not. """
419        return self._tokens is not None
420
421    def _base_tokens(self):
422        from orangecontrib.text.preprocess import BASE_TRANSFORMER, \
423            BASE_TOKENIZER, PreprocessorList
424
425        # don't use anything that requires NLTK data to assure async download
426        base_preprocessors = PreprocessorList([BASE_TRANSFORMER,
427                                               BASE_TOKENIZER])
428        corpus = base_preprocessors(self)
429        return corpus.tokens, corpus.dictionary
430
431    @property
432    def dictionary(self):
433        """
434        corpora.Dictionary: A token to id mapper.
435        """
436        if self._dictionary is None:
437            return self._base_tokens()[1]
438        return self._dictionary
439
440    def ngrams_iterator(self, join_with=' ', include_postags=False):
441        if self.pos_tags is None:
442            include_postags = False
443
444        if include_postags:
445            data = zip(self.tokens, self.pos_tags)
446        else:
447            data = self.tokens
448
449        if join_with is None:
450            processor = lambda doc, n: nltk.ngrams(doc, n)
451        elif include_postags:
452            processor = lambda doc, n: (join_with.join(token + '_' + tag for token, tag in ngram)
453                                        for ngram in nltk.ngrams(zip(*doc), n))
454        else:
455            processor = lambda doc, n: (join_with.join(ngram) for ngram in nltk.ngrams(doc, n))
456
457        return (list(chain(*(processor(doc, n)
458                for n in range(self.ngram_range[0], self.ngram_range[1]+1))))
459                for doc in data)
460
461    @property
462    def ngrams_corpus(self):
463        if self._ngrams_corpus is None:
464            return BowVectorizer().transform(self).ngrams_corpus
465        return self._ngrams_corpus
466
467    @ngrams_corpus.setter
468    def ngrams_corpus(self, value):
469        self._ngrams_corpus = value
470
471    @property
472    def ngrams(self):
473        """generator: Ngram representations of documents."""
474        return self.ngrams_iterator(join_with=' ')
475
476    def copy(self):
477        """Return a copy of the table."""
478        c = self.__class__(self.domain, self.X.copy(), self.Y.copy(), self.metas.copy(),
479                           self.W.copy(), copy(self.text_features))
480        # since tokens and dictionary are considered immutable copies are not needed
481        c._tokens = self._tokens
482        c._dictionary = self._dictionary
483        c.ngram_range = self.ngram_range
484        c.pos_tags = self.pos_tags
485        c.name = self.name
486        c.used_preprocessor = self.used_preprocessor
487        c._titles = self._titles
488        c._pp_documents = self._pp_documents
489        return c
490
491    @staticmethod
492    def from_documents(documents, name, attributes=None, class_vars=None, metas=None,
493                       title_indices=None):
494        """
495        Create corpus from documents.
496
497        Args:
498            documents (list): List of documents.
499            name (str): Name of the corpus
500            attributes (list): List of tuples (Variable, getter) for attributes.
501            class_vars (list): List of tuples (Variable, getter) for class vars.
502            metas (list): List of tuples (Variable, getter) for metas.
503            title_indices (list): List of indices into domain corresponding to features which will
504                be used as titles.
505
506        Returns:
507            Corpus.
508        """
509        attributes = attributes or []
510        class_vars = class_vars or []
511        metas = metas or []
512        title_indices = title_indices or []
513
514        domain = Domain(attributes=[attr for attr, _ in attributes],
515                        class_vars=[attr for attr, _ in class_vars],
516                        metas=[attr for attr, _ in metas])
517
518        for ind in title_indices:
519            domain[ind].attributes['title'] = True
520
521        def to_val(attr, val):
522            if isinstance(attr, DiscreteVariable):
523                attr.val_from_str_add(val)
524            return attr.to_val(val)
525
526        if documents:
527            X = np.array([[to_val(attr, func(doc)) for attr, func in attributes]
528                          for doc in documents], dtype=np.float64)
529            Y = np.array([[to_val(attr, func(doc)) for attr, func in class_vars]
530                          for doc in documents], dtype=np.float64)
531            metas = np.array([[to_val(attr, func(doc)) for attr, func in metas]
532                              for doc in documents], dtype=object)
533        else:   # assure shapes match the number of columns
534            X = np.empty((0, len(attributes)))
535            Y = np.empty((0, len(class_vars)))
536            metas = np.empty((0, len(metas)))
537
538        corpus = Corpus(X=X, Y=Y, metas=metas, domain=domain, text_features=[])
539        corpus.name = name
540        return corpus
541
542    def __getitem__(self, key):
543        c = super().__getitem__(key)
544        if isinstance(c, (Corpus, RowInstance)):
545            Corpus.retain_preprocessing(self, c, key)
546        return c
547
548    @classmethod
549    def from_table(cls, domain, source, row_indices=...):
550        t = super().from_table(domain, source, row_indices)
551        c = Corpus(t.domain, t.X, t.Y, t.metas, t.W, ids=t.ids)
552        Corpus.retain_preprocessing(source, c, row_indices)
553        return c
554
555    @classmethod
556    def from_numpy(cls, *args, **kwargs):
557        c = super().from_numpy(*args, **kwargs)
558        c._set_unique_titles()
559        return c
560
561    @classmethod
562    def from_list(cls, domain, rows, weights=None):
563        c = super().from_list(domain, rows, weights)
564        c._set_unique_titles()
565        return c
566
567    @classmethod
568    def from_table_rows(cls, source, row_indices):
569        c = super().from_table_rows(source, row_indices)
570        if hasattr(source, "_titles"):
571            # covering case when from_table_rows called by from_table
572            c._titles = source._titles[row_indices]
573        return c
574
575    @classmethod
576    def from_file(cls, filename):
577        if not os.path.exists(filename):  # check the default location
578            abs_path = os.path.join(get_sample_corpora_dir(), filename)
579            if not abs_path.endswith('.tab'):
580                abs_path += '.tab'
581            if not os.path.exists(abs_path):
582                raise FileNotFoundError('File "{}" not found.'.format(filename))
583            else:
584                filename = abs_path
585
586        table = Table.from_file(filename)
587        corpus = cls(table.domain, table.X, table.Y, table.metas, table.W)
588        return corpus
589
590    @staticmethod
591    def retain_preprocessing(orig, new, key=...):
592        """ Set preprocessing of 'new' object to match the 'orig' object. """
593        if isinstance(orig, Corpus):
594            if isinstance(key, tuple):  # get row selection
595                key = key[0]
596
597            if orig._tokens is not None:  # retain preprocessing
598                if isinstance(key, Integral):
599                    new._tokens = np.array([orig._tokens[key]])
600                    new.pos_tags = None if orig.pos_tags is None else np.array(
601                        [orig.pos_tags[key]])
602                elif isinstance(key, list) or isinstance(key, np.ndarray) \
603                        or isinstance(key, slice) or isinstance(key, range):
604                    new._tokens = orig._tokens[key]
605                    new.pos_tags = None if orig.pos_tags is None else orig.pos_tags[key]
606                elif key is Ellipsis:
607                    new._tokens = orig._tokens
608                    new.pos_tags = orig.pos_tags
609                else:
610                    raise TypeError('Indexing by type {} not supported.'.format(type(key)))
611                new._dictionary = orig._dictionary
612
613            if isinstance(new, Corpus):
614                # _find_identical_feature returns non when feature not found
615                # filter this Nones from list
616                new.text_features = list(filter(None, [
617                    new._find_identical_feature(tf)
618                    for tf in orig.text_features
619                ]))
620            else:
621                new.text_features = [
622                    tf
623                    for tf in orig.text_features
624                    if tf in set(new.domain.metas)
625                ]
626
627            new._titles = orig._titles[key]
628            new.ngram_range = orig.ngram_range
629            new.attributes = orig.attributes
630            new.used_preprocessor = orig.used_preprocessor
631
632    def __eq__(self, other):
633        def arrays_equal(a, b):
634            if sp.issparse(a) != sp.issparse(b):
635                return False
636            elif sp.issparse(a) and sp.issparse(b):
637                return (a != b).nnz == 0
638            else:
639                return np.array_equal(a, b)
640
641        return (self.text_features == other.text_features and
642                self._dictionary == other._dictionary and
643                np.array_equal(self._tokens, other._tokens) and
644                arrays_equal(self.X, other.X) and
645                arrays_equal(self.Y, other.Y) and
646                arrays_equal(self.metas, other.metas) and
647                np.array_equal(self.pos_tags, other.pos_tags) and
648                self.domain == other.domain and
649                self.ngram_range == other.ngram_range)
650