#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""This module provides some code scaffolding to simplify the use of a built dictionary for constructing BoW vectors.

Notes
-----
Text corpora usually reside on disk, as text files in one format or another. In a common scenario,
we need to build a dictionary (a `word->integer id` mapping), which is then used to construct sparse bag-of-word vectors
(= iterable of `(word_id, word_weight)`).

This module provides some code scaffolding to simplify this pipeline. For example, given a corpus where each document
is a separate line in a file on disk, you would override the :meth:`gensim.corpora.textcorpus.TextCorpus.get_texts`
method to read one line=document at a time, process it (lowercase, tokenize, whatever) and yield it as a sequence of words.

Overriding :meth:`gensim.corpora.textcorpus.TextCorpus.get_texts` is enough; you can then initialize the corpus
with e.g. `MyTextCorpus("mycorpus.txt.bz2")` and it will behave correctly like a corpus of sparse vectors.
The :meth:`~gensim.corpora.textcorpus.TextCorpus.__iter__` method is automatically set up,
and the dictionary is automatically populated with all `word->id` mappings.
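
For example, a minimal sketch of such a subclass (reading from a hypothetical one-document-per-line file
``mycorpus.txt.bz2``) only needs to override :meth:`~gensim.corpora.textcorpus.TextCorpus.get_texts`:

.. sourcecode:: pycon

    >>> from gensim import utils
    >>> from gensim.corpora.textcorpus import TextCorpus
    >>>
    >>> class MyTextCorpus(TextCorpus):
    ...     def get_texts(self):
    ...         for line in self.getstream():
    ...             yield utils.to_unicode(line).lower().split()
    >>>
    >>> corpus = MyTextCorpus('mycorpus.txt.bz2')  # iterating over `corpus` yields sparse BoW vectors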

The resulting object can be used as input to some of gensim's models (:class:`~gensim.models.tfidfmodel.TfidfModel`,
:class:`~gensim.models.lsimodel.LsiModel`, :class:`~gensim.models.ldamodel.LdaModel`, ...) and serialized with any format
(`Matrix Market <http://math.nist.gov/MatrixMarket/formats.html>`_,
`SvmLight <http://svmlight.joachims.org/>`_, `Blei's LDA-C format <https://github.com/blei-lab/lda-c>`_, etc).


See Also
--------
:class:`gensim.test.test_miislita.CorpusMiislita`
    Good simple example.

"""


from __future__ import with_statement

import logging
import os
import random
import re
import sys

from gensim import interfaces, utils
from gensim.corpora.dictionary import Dictionary
from gensim.parsing.preprocessing import STOPWORDS, RE_WHITESPACE
from gensim.utils import deaccent, simple_tokenize

logger = logging.getLogger(__name__)


def remove_stopwords(tokens, stopwords=STOPWORDS):
    """Remove stopwords using list from `gensim.parsing.preprocessing.STOPWORDS`.

    Parameters
    ----------
    tokens : iterable of str
        Sequence of tokens.
    stopwords : iterable of str, optional
        Sequence of stopwords.

    Returns
    -------
    list of str
        List of tokens without `stopwords`.

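    Examples
    --------
    A small usage sketch (``quick``, ``brown`` and ``fox`` are assumed not to be in `stopwords`):

    .. sourcecode:: pycon

        >>> from gensim.corpora.textcorpus import remove_stopwords
        >>> remove_stopwords(["the", "quick", "brown", "fox"])
        ['quick', 'brown', 'fox']
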
    """
    return [token for token in tokens if token not in stopwords]


def remove_short(tokens, minsize=3):
    """Remove tokens shorter than `minsize` chars.

    Parameters
    ----------
    tokens : iterable of str
        Sequence of tokens.
    minsize : int, optional
        Minimal length of token (inclusive).

    Returns
    -------
    list of str
        List of tokens without short tokens.

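    Examples
    --------
    A small usage sketch:

    .. sourcecode:: pycon

        >>> from gensim.corpora.textcorpus import remove_short
        >>> remove_short(["ox", "cat", "hippopotamus"], minsize=3)
        ['cat', 'hippopotamus']
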
    """
    return [token for token in tokens if len(token) >= minsize]


def lower_to_unicode(text, encoding='utf8', errors='strict'):
    """Lowercase `text` and convert to unicode, using :func:`gensim.utils.any2unicode`.

    Parameters
    ----------
    text : str
        Input text.
    encoding : str, optional
        Encoding that will be used for conversion.
    errors : str, optional
        Error handling behaviour, used as a parameter for the `unicode` function (Python 2 only).

    Returns
    -------
    str
        Unicode version of `text`.

    See Also
    --------
    :func:`gensim.utils.any2unicode`
        Convert any string to unicode-string.

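    Examples
    --------
    A small usage sketch (assuming UTF-8 encoded input):

    .. sourcecode:: pycon

        >>> from gensim.corpora.textcorpus import lower_to_unicode
        >>> print(lower_to_unicode(b'HELLO World'))
        hello world
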
    """
    return utils.to_unicode(text.lower(), encoding, errors)


def strip_multiple_whitespaces(s):
    """Collapse multiple whitespace characters into a single space.

    Parameters
    ----------
    s : str
        Input string.

    Returns
    -------
    str
        String with collapsed whitespaces.

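    Examples
    --------
    A small usage sketch:

    .. sourcecode:: pycon

        >>> from gensim.corpora.textcorpus import strip_multiple_whitespaces
        >>> strip_multiple_whitespaces("hello   world")
        'hello world'
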
    """
    return RE_WHITESPACE.sub(" ", s)


class TextCorpus(interfaces.CorpusABC):
    """Helper class to simplify the pipeline of getting BoW vectors from plain text.

    Notes
    -----
    This is an abstract base class: override the :meth:`~gensim.corpora.textcorpus.TextCorpus.get_texts` and
    :meth:`~gensim.corpora.textcorpus.TextCorpus.__len__` methods to match your particular input.

    Given a filename (or a file-like object) in the constructor, the corpus object will be automatically initialized
    with a dictionary in `self.dictionary` and will support the :meth:`~gensim.corpora.textcorpus.TextCorpus.__iter__`
    corpus method.  You have a few different ways of utilizing this class: via subclassing, or by construction with
    different preprocessing arguments.

    The :meth:`~gensim.corpora.textcorpus.TextCorpus.__iter__` method converts the lists of tokens produced by
    :meth:`~gensim.corpora.textcorpus.TextCorpus.get_texts` to BoW format using
    :meth:`gensim.corpora.dictionary.Dictionary.doc2bow`.

    :meth:`~gensim.corpora.textcorpus.TextCorpus.get_texts` does the following:

    #. Calls :meth:`~gensim.corpora.textcorpus.TextCorpus.getstream` to get a generator over the texts.
       It yields each document in turn from the underlying text file or files.
    #. For each document from the stream, calls :meth:`~gensim.corpora.textcorpus.TextCorpus.preprocess_text` to produce
       a list of tokens. If `metadata=True`, it yields a 2-`tuple` with the document number as the second element.

    Preprocessing consists of 0+ `character_filters`, a `tokenizer`, and 0+ `token_filters`.

    The preprocessing consists of calling each filter in `character_filters` with the document text.
    Unicode is not guaranteed, and if desired, the first filter should convert to unicode.
    The output of each character filter should be another string. The output from the final filter is fed
    to the `tokenizer`, which should split the string into a list of tokens (strings).
    Afterwards, the list of tokens is fed through each filter in `token_filters`. The final output returned from
    :meth:`~gensim.corpora.textcorpus.TextCorpus.preprocess_text` is the output from the final token filter.

    So to use this class, you can either pass in different preprocessing functions using the
    `character_filters`, `tokenizer`, and `token_filters` arguments (see the example below), or you can subclass it.

    If subclassing: override :meth:`~gensim.corpora.textcorpus.TextCorpus.getstream` to take text from different input
    sources in different formats.
    Override :meth:`~gensim.corpora.textcorpus.TextCorpus.preprocess_text` if you must provide different initial
    preprocessing, then call the base :meth:`~gensim.corpora.textcorpus.TextCorpus.preprocess_text` method to apply
    the normal preprocessing.
    You can also override :meth:`~gensim.corpora.textcorpus.TextCorpus.get_texts` in order to tag the documents
    (token lists) with different metadata.

    The default preprocessing consists of:

    #. :func:`~gensim.corpora.textcorpus.lower_to_unicode` - lowercase and convert to unicode (assumes utf8 encoding)
    #. :func:`~gensim.utils.deaccent` - deaccent (asciifolding)
    #. :func:`~gensim.corpora.textcorpus.strip_multiple_whitespaces` - collapse multiple whitespaces into a single one
    #. :func:`~gensim.utils.simple_tokenize` - tokenize by splitting on whitespace
    #. :func:`~gensim.corpora.textcorpus.remove_short` - remove words shorter than 3 characters
    #. :func:`~gensim.corpora.textcorpus.remove_stopwords` - remove stopwords

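    For example, to keep the default pipeline but skip stopword removal, you could pass your own `token_filters`.
    A sketch, where ``my_corpus.txt`` is a hypothetical one-document-per-line file:

    .. sourcecode:: pycon

        >>> from gensim.corpora.textcorpus import TextCorpus, remove_short
        >>>
        >>> corpus = TextCorpus('my_corpus.txt', token_filters=[remove_short])
        >>> for bow in corpus:  # stream of sparse BoW vectors
        ...     pass
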
    """

    def __init__(self, input=None, dictionary=None, metadata=False, character_filters=None,
                 tokenizer=None, token_filters=None):
        """

        Parameters
        ----------
        input : str, optional
            Path to top-level directory (file) to traverse for corpus documents.
        dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            If a dictionary is provided, it will not be updated with the given corpus on initialization.
            If None - new dictionary will be built for the given corpus.
            If `input` is None, the dictionary will remain uninitialized.
        metadata : bool, optional
            If True - yield metadata with each document.
        character_filters : iterable of callable, optional
            Each will be applied to the text of each document in order, and should return a single string with
            the modified text. For Python 2, the original text will not be unicode, so it may be useful to
            convert to unicode as the first character filter.
            If None - using :func:`~gensim.corpora.textcorpus.lower_to_unicode`,
            :func:`~gensim.utils.deaccent` and :func:`~gensim.corpora.textcorpus.strip_multiple_whitespaces`.
        tokenizer : callable, optional
            Tokenizer for document, if None - using :func:`~gensim.utils.simple_tokenize`.
        token_filters : iterable of callable, optional
            Each will be applied to the iterable of tokens in order, and should return another iterable of tokens.
            These filters can add, remove, or replace tokens, or do nothing at all.
            If None - using :func:`~gensim.corpora.textcorpus.remove_short` and
            :func:`~gensim.corpora.textcorpus.remove_stopwords`.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora.textcorpus import TextCorpus
            >>> from gensim.test.utils import datapath
            >>> from gensim import utils
            >>>
            >>>
            >>> class CorpusMiislita(TextCorpus):
            ...     stopwords = set('for a of the and to in on'.split())
            ...
            ...     def get_texts(self):
            ...         for doc in self.getstream():
            ...             yield [word for word in utils.to_unicode(doc).lower().split() if word not in self.stopwords]
            ...
            ...     def __len__(self):
            ...         self.length = sum(1 for _ in self.get_texts())
            ...         return self.length
            >>>
            >>>
            >>> corpus = CorpusMiislita(datapath('head500.noblanks.cor.bz2'))
            >>> len(corpus)
            250
            >>> document = next(iter(corpus.get_texts()))

        """
        self.input = input
        self.metadata = metadata

        self.character_filters = character_filters
        if self.character_filters is None:
            self.character_filters = [lower_to_unicode, deaccent, strip_multiple_whitespaces]

        self.tokenizer = tokenizer
        if self.tokenizer is None:
            self.tokenizer = simple_tokenize

        self.token_filters = token_filters
        if self.token_filters is None:
            self.token_filters = [remove_short, remove_stopwords]

        self.length = None
        self.dictionary = None
        self.init_dictionary(dictionary)

    def init_dictionary(self, dictionary):
        """Initialize/update dictionary.

        Parameters
        ----------
        dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            If a dictionary is provided, it will not be updated with the given corpus on initialization.
            If None - new dictionary will be built for the given corpus.

        Notes
        -----
        If `self.input` is None, this method does nothing.

        """
        self.dictionary = dictionary if dictionary is not None else Dictionary()
        if self.input is not None:
            if dictionary is None:
                logger.info("Initializing dictionary")
                metadata_setting = self.metadata
                self.metadata = False
                self.dictionary.add_documents(self.get_texts())
                self.metadata = metadata_setting
            else:
                logger.info("Input stream provided but dictionary already initialized")
        else:
            logger.warning("No input document stream provided; assuming dictionary will be initialized some other way.")

    def __iter__(self):
        """Iterate over the corpus.

        Yields
        ------
        list of (int, int)
            Document in BoW format (+ metadata if self.metadata).

        """
        if self.metadata:
            for text, metadata in self.get_texts():
                yield self.dictionary.doc2bow(text, allow_update=False), metadata
        else:
            for text in self.get_texts():
                yield self.dictionary.doc2bow(text, allow_update=False)

    def getstream(self):
        """Generate documents from the underlying plain text collection (of one or more files).

        Yields
        ------
        str
            Document read from plain-text file.

        Notes
        -----
        Once the generator is exhausted, the `self.length` attribute is set.

        """
        num_texts = 0
        with utils.file_or_filename(self.input) as f:
            for line in f:
                yield line
                num_texts += 1

        self.length = num_texts

    def preprocess_text(self, text):
        """Apply `self.character_filters`, `self.tokenizer`, `self.token_filters` to a single text document.

        Parameters
        ----------
        text : str
            Document read from plain-text file.

        Returns
        -------
        list of str
            List of tokens extracted from `text`.

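        Examples
        --------
        A small sketch, assuming the corpus was built with the default filters (``'the'`` is a stopword,
        and tokens shorter than 3 characters are dropped):

        .. sourcecode:: pycon

            >>> tokens = corpus.preprocess_text(u"The Quick  Brown Fox")  # => ['quick', 'brown', 'fox']
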
        """
        for character_filter in self.character_filters:
            text = character_filter(text)

        tokens = self.tokenizer(text)
        for token_filter in self.token_filters:
            tokens = token_filter(tokens)

        return tokens

    def step_through_preprocess(self, text):
        """Apply the preprocessing steps one by one, yielding the result of each step.

        Notes
        -----
        This is useful for debugging issues with the corpus preprocessing pipeline.

        Parameters
        ----------
        text : str
            Document text read from plain-text file.

        Yields
        ------
        (callable, object)
            The preprocessing step and its output for `text`.

        """
        for character_filter in self.character_filters:
            text = character_filter(text)
            yield (character_filter, text)

        tokens = self.tokenizer(text)
        yield (self.tokenizer, tokens)

        for token_filter in self.token_filters:
            yield (token_filter, token_filter(tokens))

    def get_texts(self):
        """Generate documents from the corpus.

        Yields
        ------
        list of str
            Document as sequence of tokens (+ lineno if self.metadata).

        """
        lines = self.getstream()
        if self.metadata:
            for lineno, line in enumerate(lines):
                yield self.preprocess_text(line), (lineno,)
        else:
            for line in lines:
                yield self.preprocess_text(line)

    def sample_texts(self, n, seed=None, length=None):
        """Generate `n` random documents from the corpus without replacement.

        Parameters
        ----------
        n : int
            Number of documents we want to sample.
        seed : int, optional
            If specified, use it as a seed for a local random generator.
        length : int, optional
            Value to use as the corpus length (calculating the true length of the corpus can be costly).
            If not specified, `__len__` will be called.

        Raises
        ------
        ValueError
            If `n` is less than zero or greater than the corpus size.

        Notes
        -----
        Given the number of remaining documents in the corpus, we need to choose `n` elements.
        The probability for the current element to be chosen is `n` / remaining. If we choose it, we decrease
        `n` and move to the next element.

        Yields
        ------
        list of str
            Sampled document as sequence of tokens.

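        Examples
        --------
        A usage sketch, assuming ``corpus`` is an already initialized :class:`TextCorpus` with at least 5 documents:

        .. sourcecode:: pycon

            >>> sampled_docs = list(corpus.sample_texts(n=5, seed=42))  # 5 preprocessed documents, reproducible via `seed`
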
        """
        random_generator = random if seed is None else random.Random(seed)
        if length is None:
            length = len(self)

        if not n <= length:
            raise ValueError("n {0:d} is larger than the length of corpus {1:d}.".format(n, length))
        if not 0 <= n:
            raise ValueError("Negative sample size n {0:d}.".format(n))

        i = 0
        for i, sample in enumerate(self.getstream()):
            if i == length:
                break

            remaining_in_corpus = length - i
            chance = random_generator.randint(1, remaining_in_corpus)
            if chance <= n:
                n -= 1
                if self.metadata:
                    yield self.preprocess_text(sample[0]), sample[1]
                else:
                    yield self.preprocess_text(sample)

        if n != 0:
            # This means that length was set to be greater than number of items in corpus
            # and we were not able to sample enough documents before the stream ended.
            raise ValueError("length {0:d} greater than number of documents in corpus {1:d}".format(length, i + 1))

    def __len__(self):
        """Get length of corpus.

        Warnings
        --------
        If `self.length` is None, the entire corpus will be read to calculate it, via
        :meth:`~gensim.corpora.textcorpus.TextCorpus.getstream`.

        Returns
        -------
        int
            Length of corpus.

        """
        if self.length is None:
            # cache the corpus length
            self.length = sum(1 for _ in self.getstream())
        return self.length


class TextDirectoryCorpus(TextCorpus):
    """Read documents recursively from a directory.
    Each file (or each line, depending on `lines_are_documents`) is interpreted as a plain text document.

    """

    def __init__(self, input, dictionary=None, metadata=False, min_depth=0, max_depth=None,
                 pattern=None, exclude_pattern=None, lines_are_documents=False, **kwargs):
        """

        Parameters
        ----------
        input : str
            Path to input file/folder.
        dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            If a dictionary is provided, it will not be updated with the given corpus on initialization.
            If None - new dictionary will be built for the given corpus.
            If `input` is None, the dictionary will remain uninitialized.
        metadata : bool, optional
            If True - yield metadata with each document.
        min_depth : int, optional
            Minimum depth in directory tree at which to begin searching for files.
        max_depth : int, optional
            Max depth in directory tree at which files will no longer be considered.
            If None - not limited.
        pattern : str, optional
            Regex to use for file name inclusion; all files *not* matching this pattern will be ignored.
        exclude_pattern : str, optional
            Regex to use for file name exclusion; all files matching this pattern will be ignored.
        lines_are_documents : bool, optional
            If True - each line is considered a document, otherwise - each file is one document.
        **kwargs
            Keyword arguments passed through to the :class:`~gensim.corpora.textcorpus.TextCorpus` constructor.
            See :meth:`gensim.corpora.textcorpus.TextCorpus.__init__` docstring for more details on these.

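        Examples
        --------
        A construction sketch (``/path/to/corpus`` is a hypothetical directory; only files whose names end
        in ``.txt`` are read, one document per file):

        .. sourcecode:: pycon

            >>> from gensim.corpora.textcorpus import TextDirectoryCorpus
            >>>
            >>> corpus = TextDirectoryCorpus('/path/to/corpus', pattern='.*[.]txt$', lines_are_documents=False)
            >>> num_docs = len(corpus)  # one document per matching file
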
        """
        self._min_depth = min_depth
        self._max_depth = sys.maxsize if max_depth is None else max_depth
        self.pattern = pattern
        self.exclude_pattern = exclude_pattern
        self.lines_are_documents = lines_are_documents
        super(TextDirectoryCorpus, self).__init__(input, dictionary, metadata, **kwargs)

    @property
    def lines_are_documents(self):
        return self._lines_are_documents

    @lines_are_documents.setter
    def lines_are_documents(self, lines_are_documents):
        self._lines_are_documents = lines_are_documents
        self.length = None

    @property
    def pattern(self):
        return self._pattern

    @pattern.setter
    def pattern(self, pattern):
        self._pattern = None if pattern is None else re.compile(pattern)
        self.length = None

    @property
    def exclude_pattern(self):
        return self._exclude_pattern

    @exclude_pattern.setter
    def exclude_pattern(self, pattern):
        self._exclude_pattern = None if pattern is None else re.compile(pattern)
        self.length = None

    @property
    def min_depth(self):
        return self._min_depth

    @min_depth.setter
    def min_depth(self, min_depth):
        self._min_depth = min_depth
        self.length = None

    @property
    def max_depth(self):
        return self._max_depth

    @max_depth.setter
    def max_depth(self, max_depth):
        self._max_depth = max_depth
        self.length = None

    def iter_filepaths(self):
        """Lazily generate paths to each file in the directory structure within the specified range of depths.
        If a filename pattern to match was given, further filter to only those filenames that match.

        Yields
        ------
        str
            Path to file.

        """
        for depth, dirpath, dirnames, filenames in walk(self.input):
            if self.min_depth <= depth <= self.max_depth:
                if self.pattern is not None:
                    filenames = (n for n in filenames if self.pattern.match(n) is not None)
                if self.exclude_pattern is not None:
                    filenames = (n for n in filenames if self.exclude_pattern.match(n) is None)

                for name in filenames:
                    yield os.path.join(dirpath, name)

    def getstream(self):
        """Generate documents from the underlying plain text collection (of one or more files).

        Yields
        ------
        str
            One document per line if `lines_are_documents` is True, otherwise one document per file.

        """
        num_texts = 0
        for path in self.iter_filepaths():
            with open(path, 'rt') as f:
                if self.lines_are_documents:
                    for line in f:
                        yield line.strip()
                        num_texts += 1
                else:
                    yield f.read().strip()
                    num_texts += 1

        self.length = num_texts

    def __len__(self):
        """Get length of corpus.

        Returns
        -------
        int
            Length of corpus.

        """
        if self.length is None:
            self._cache_corpus_length()
        return self.length

    def _cache_corpus_length(self):
        """Calculate length of corpus and cache it to `self.length`."""
        if not self.lines_are_documents:
            self.length = sum(1 for _ in self.iter_filepaths())
        else:
            self.length = sum(1 for _ in self.getstream())


def walk(top, topdown=True, onerror=None, followlinks=False, depth=0):
    """Generate the file names in a directory tree by walking the tree either top-down or bottom-up.
    For each directory in the tree rooted at directory top (including top itself), it yields a 4-tuple
    (depth, dirpath, dirnames, filenames).

    Parameters
    ----------
    top : str
        Root directory.
    topdown : bool, optional
        If True - yield each directory before its subdirectories (this also lets you modify `dirnames` in-place
        to prune the walk).
    onerror : function, optional
        A function that will be called with one argument, an OSError instance.
        It can report the error to continue with the walk, or raise the exception to abort the walk.
        Note that the filename is available as the filename attribute of the exception object.
    followlinks : bool, optional
        If True - visit directories pointed to by symlinks, on systems that support them.
    depth : int, optional
        Current depth in the directory tree; don't pass it manually (it is used as an accumulator for the recursion).

    Notes
    -----
    This is a mostly copied version of `os.walk` from the Python 2 source code.
    The only difference is that it returns the depth in the directory tree structure
    at which each yield is taking place.

    Yields
    ------
    (int, str, list of str, list of str)
        Depth, current path, visited directories, visited non-directories.

    See Also
    --------
    `os.walk documentation <https://docs.python.org/2/library/os.html#os.walk>`_

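    Examples
    --------
    A usage sketch over a hypothetical directory tree:

    .. sourcecode:: pycon

        >>> from gensim.corpora.textcorpus import walk
        >>>
        >>> for depth, dirpath, dirnames, filenames in walk('/path/to/corpus'):
        ...     print(depth, dirpath)
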
    """
    islink, join, isdir = os.path.islink, os.path.join, os.path.isdir

    try:
        # Should be O(1) since it's probably just reading your filesystem journal
        names = os.listdir(top)
    except OSError as err:
        if onerror is not None:
            onerror(err)
        return

    dirs, nondirs = [], []

    # O(n) where n = number of files in the directory
    for name in names:
        if isdir(join(top, name)):
            dirs.append(name)
        else:
            nondirs.append(name)

    if topdown:
        yield depth, top, dirs, nondirs

    # Again O(n), where n = number of directories in the directory
    for name in dirs:
        new_path = join(top, name)
        if followlinks or not islink(new_path):

            # Generator so besides the recursive `walk()` call, no additional cost here.
            for x in walk(new_path, topdown, onerror, followlinks, depth + 1):
                yield x
    if not topdown:
        yield depth, top, dirs, nondirs
