#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""This module provides some code scaffolding to simplify the use of a built dictionary for constructing BoW vectors.

Notes
-----
Text corpora usually reside on disk, as text files in one format or another. In a common scenario,
we need to build a dictionary (a `word->integer id` mapping), which is then used to construct sparse bag-of-words
vectors (= iterable of `(word_id, word_weight)`).

This module provides some code scaffolding to simplify this pipeline. For example, given a corpus where each document
is a separate line in a file on disk, you would override :meth:`gensim.corpora.textcorpus.TextCorpus.get_texts`
to read one line=document at a time, process it (lowercase, tokenize, whatever) and yield it as a sequence of words.

Overriding :meth:`gensim.corpora.textcorpus.TextCorpus.get_texts` is enough; you can then initialize the corpus
with e.g. `MyTextCorpus("mycorpus.txt.bz2")` and it will behave correctly like a corpus of sparse vectors.
The :meth:`~gensim.corpora.textcorpus.TextCorpus.__iter__` method is automatically set up,
and the dictionary is automatically populated with all `word->id` mappings.

The resulting object can be used as input to some of the gensim models (:class:`~gensim.models.tfidfmodel.TfidfModel`,
:class:`~gensim.models.lsimodel.LsiModel`, :class:`~gensim.models.ldamodel.LdaModel`, ...) and serialized in any format
(`Matrix Market <http://math.nist.gov/MatrixMarket/formats.html>`_,
`SvmLight <http://svmlight.joachims.org/>`_, `Blei's LDA-C format <https://github.com/blei-lab/lda-c>`_, etc).


See Also
--------
:class:`gensim.test.test_miislita.CorpusMiislita`
    Good simple example.

"""


from __future__ import with_statement

import logging
import os
import random
import re
import sys

from gensim import interfaces, utils
from gensim.corpora.dictionary import Dictionary
from gensim.parsing.preprocessing import STOPWORDS, RE_WHITESPACE
from gensim.utils import deaccent, simple_tokenize

logger = logging.getLogger(__name__)


def remove_stopwords(tokens, stopwords=STOPWORDS):
    """Remove stopwords using the list from `gensim.parsing.preprocessing.STOPWORDS`.

    Parameters
    ----------
    tokens : iterable of str
        Sequence of tokens.
    stopwords : iterable of str, optional
        Sequence of stopwords.

    Returns
    -------
    list of str
        List of tokens without `stopwords`.

    """
    return [token for token in tokens if token not in stopwords]


def remove_short(tokens, minsize=3):
    """Remove tokens shorter than `minsize` chars.

    Parameters
    ----------
    tokens : iterable of str
        Sequence of tokens.
    minsize : int, optional
        Minimal length of token (inclusive).

    Returns
    -------
    list of str
        List of tokens without short tokens.

    """
    return [token for token in tokens if len(token) >= minsize]


def lower_to_unicode(text, encoding='utf8', errors='strict'):
    """Lowercase `text` and convert to unicode, using :func:`gensim.utils.any2unicode`.

    Parameters
    ----------
    text : str
        Input text.
    encoding : str, optional
        Encoding that will be used for conversion.
    errors : str, optional
        Error handling behaviour, used as parameter for `unicode` function (python2 only).

    Returns
    -------
    str
        Unicode version of `text`.

    See Also
    --------
    :func:`gensim.utils.any2unicode`
        Convert any string to unicode-string.
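
    Examples
    --------
    A minimal sketch of the expected behaviour:

    .. sourcecode:: pycon

        >>> from gensim.corpora.textcorpus import lower_to_unicode
        >>>
        >>> print(lower_to_unicode('HELLO World'))
        hello world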

    """
    return utils.to_unicode(text.lower(), encoding, errors)


def strip_multiple_whitespaces(s):
    """Collapse multiple whitespace characters into a single space.

    Parameters
    ----------
    s : str
        Input string.

    Returns
    -------
    str
        String with collapsed whitespaces.

    """
    return RE_WHITESPACE.sub(" ", s)


class TextCorpus(interfaces.CorpusABC):
    """Helper class to simplify the pipeline of getting BoW vectors from plain text.

    Notes
    -----
    This is an abstract base class: override the :meth:`~gensim.corpora.textcorpus.TextCorpus.get_texts` and
    :meth:`~gensim.corpora.textcorpus.TextCorpus.__len__` methods to match your particular input.

    Given a filename (or a file-like object) in the constructor, the corpus object will be automatically initialized
    with a dictionary in `self.dictionary` and will support the :meth:`~gensim.corpora.textcorpus.TextCorpus.__iter__`
    corpus method. You can utilize this class either by subclassing it or by constructing it with different
    preprocessing arguments.

    The :meth:`~gensim.corpora.textcorpus.TextCorpus.__iter__` method converts the lists of tokens produced by
    :meth:`~gensim.corpora.textcorpus.TextCorpus.get_texts` to BoW format using
    :meth:`gensim.corpora.dictionary.Dictionary.doc2bow`.

    :meth:`~gensim.corpora.textcorpus.TextCorpus.get_texts` does the following:

    #. Calls :meth:`~gensim.corpora.textcorpus.TextCorpus.getstream` to get a generator over the texts.
       It yields each document in turn from the underlying text file or files.
    #. For each document from the stream, calls :meth:`~gensim.corpora.textcorpus.TextCorpus.preprocess_text` to
       produce a list of tokens. If metadata=True, it yields a 2-`tuple` with the document number as the second
       element.

    Preprocessing consists of 0+ `character_filters`, a `tokenizer`, and 0+ `token_filters`.

    Each filter in `character_filters` is called in turn with the document text.
    Unicode is not guaranteed, and if desired, the first filter should convert to unicode.
    The output of each character filter should be another string. The output from the final filter is fed
    to the `tokenizer`, which should split the string into a list of tokens (strings).
    Afterwards, the list of tokens is fed through each filter in `token_filters`. The final output returned from
    :meth:`~gensim.corpora.textcorpus.TextCorpus.preprocess_text` is the output from the final token filter.

    So to use this class, you can either pass in different preprocessing functions using the
    `character_filters`, `tokenizer`, and `token_filters` arguments, or you can subclass it.

    If subclassing: override :meth:`~gensim.corpora.textcorpus.TextCorpus.getstream` to take text from different input
    sources in different formats.
    Override :meth:`~gensim.corpora.textcorpus.TextCorpus.preprocess_text` if you must provide different initial
    preprocessing, then call the :meth:`~gensim.corpora.textcorpus.TextCorpus.preprocess_text` method to apply
    the normal preprocessing.
    You can also override :meth:`~gensim.corpora.textcorpus.TextCorpus.get_texts` in order to tag the documents
    (token lists) with different metadata.

    The default preprocessing consists of (see also the construction sketch below):

    #. :func:`~gensim.corpora.textcorpus.lower_to_unicode` - lowercase and convert to unicode (assumes utf8 encoding)
    #. :func:`~gensim.utils.deaccent` - deaccent (asciifolding)
    #. :func:`~gensim.corpora.textcorpus.strip_multiple_whitespaces` - collapse multiple whitespaces into a single one
    #. :func:`~gensim.utils.simple_tokenize` - tokenize by splitting on whitespace
    #. :func:`~gensim.corpora.textcorpus.remove_short` - remove words less than 3 characters long
    #. :func:`~gensim.corpora.textcorpus.remove_stopwords` - remove stopwords
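
    Examples
    --------
    A sketch of the construction route, using the default tokenizer but a reduced
    set of filters (the file name `my_corpus.txt` is hypothetical - any plain-text
    file with one document per line will do):

    .. sourcecode:: pycon

        >>> from gensim.corpora.textcorpus import TextCorpus, lower_to_unicode, remove_stopwords
        >>>
        >>> corpus = TextCorpus(
        ...     'my_corpus.txt',
        ...     character_filters=[lower_to_unicode],  # keep accents and extra whitespace
        ...     token_filters=[remove_stopwords],  # keep short tokens, drop stopwords only
        ... )
        >>> for bow in corpus:  # one sparse BoW vector per line of the file
        ...     pass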

    """

    def __init__(self, input=None, dictionary=None, metadata=False, character_filters=None,
                 tokenizer=None, token_filters=None):
        """

        Parameters
        ----------
        input : str, optional
            Path to the top-level directory (file) to traverse for corpus documents.
        dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            If a dictionary is provided, it will not be updated with the given corpus on initialization.
            If None - a new dictionary will be built for the given corpus.
            If `input` is None, the dictionary will remain uninitialized.
        metadata : bool, optional
            If True - yield metadata with each document.
        character_filters : iterable of callable, optional
            Each will be applied to the text of each document in order, and should return a single string with
            the modified text. For Python 2, the original text will not be unicode, so it may be useful to
            convert to unicode as the first character filter.
            If None - using :func:`~gensim.corpora.textcorpus.lower_to_unicode`,
            :func:`~gensim.utils.deaccent` and :func:`~gensim.corpora.textcorpus.strip_multiple_whitespaces`.
        tokenizer : callable, optional
            Tokenizer for the document, if None - using :func:`~gensim.utils.simple_tokenize`.
        token_filters : iterable of callable, optional
            Each will be applied to the iterable of tokens in order, and should return another iterable of tokens.
            These filters can add, remove, or replace tokens, or do nothing at all.
            If None - using :func:`~gensim.corpora.textcorpus.remove_short` and
            :func:`~gensim.corpora.textcorpus.remove_stopwords`.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora.textcorpus import TextCorpus
            >>> from gensim.test.utils import datapath
            >>> from gensim import utils
            >>>
            >>>
            >>> class CorpusMiislita(TextCorpus):
            ...     stopwords = set('for a of the and to in on'.split())
            ...
            ...     def get_texts(self):
            ...         for doc in self.getstream():
            ...             yield [word for word in utils.to_unicode(doc).lower().split() if word not in self.stopwords]
            ...
            ...     def __len__(self):
            ...         self.length = sum(1 for _ in self.get_texts())
            ...         return self.length
            >>>
            >>>
            >>> corpus = CorpusMiislita(datapath('head500.noblanks.cor.bz2'))
            >>> len(corpus)
            250
            >>> document = next(iter(corpus.get_texts()))
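
        A corpus built this way can be passed straight to a gensim model. A minimal
        follow-up sketch, reusing `corpus` from above:

        .. sourcecode:: pycon

            >>> from gensim.models import TfidfModel
            >>>
            >>> tfidf = TfidfModel(corpus)  # train TF-IDF directly on the streamed BoW vectors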

        """
        self.input = input
        self.metadata = metadata

        self.character_filters = character_filters
        if self.character_filters is None:
            self.character_filters = [lower_to_unicode, deaccent, strip_multiple_whitespaces]

        self.tokenizer = tokenizer
        if self.tokenizer is None:
            self.tokenizer = simple_tokenize

        self.token_filters = token_filters
        if self.token_filters is None:
            self.token_filters = [remove_short, remove_stopwords]

        self.length = None
        self.dictionary = None
        self.init_dictionary(dictionary)

    def init_dictionary(self, dictionary):
        """Initialize/update dictionary.

        Parameters
        ----------
        dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            If a dictionary is provided, it will not be updated with the given corpus on initialization.
            If None - a new dictionary will be built for the given corpus.

        Notes
        -----
        If self.input is None - the dictionary is left unpopulated and a warning is logged.

        """
        self.dictionary = dictionary if dictionary is not None else Dictionary()
        if self.input is not None:
            if dictionary is None:
                logger.info("Initializing dictionary")
                metadata_setting = self.metadata
                self.metadata = False
                self.dictionary.add_documents(self.get_texts())
                self.metadata = metadata_setting
            else:
                logger.info("Input stream provided but dictionary already initialized")
        else:
            logger.warning("No input document stream provided; assuming dictionary will be initialized some other way.")

    def __iter__(self):
        """Iterate over the corpus.

        Yields
        ------
        list of (int, int)
            Document in BoW format (+ metadata if self.metadata).

        """
        if self.metadata:
            for text, metadata in self.get_texts():
                yield self.dictionary.doc2bow(text, allow_update=False), metadata
        else:
            for text in self.get_texts():
                yield self.dictionary.doc2bow(text, allow_update=False)

    def getstream(self):
        """Generate documents from the underlying plain text collection (of one or more files).

        Yields
        ------
        str
            Document read from plain-text file.

        Notes
        -----
        Once the generator is exhausted, the `self.length` attribute is set.

        """
        num_texts = 0
        with utils.file_or_filename(self.input) as f:
            for line in f:
                yield line
                num_texts += 1

        self.length = num_texts

    def preprocess_text(self, text):
        """Apply `self.character_filters`, `self.tokenizer` and `self.token_filters` to a single text document.

        Parameters
        ----------
        text : str
            Document read from plain-text file.

        Returns
        -------
        list of str
            List of tokens extracted from `text`.

        """
        for character_filter in self.character_filters:
            text = character_filter(text)

        tokens = self.tokenizer(text)
        for token_filter in self.token_filters:
            tokens = token_filter(tokens)

        return tokens

    def step_through_preprocess(self, text):
        """Apply the preprocessing steps one by one, yielding each intermediate result.

        Notes
        -----
        This is useful for debugging issues with the corpus preprocessing pipeline.

        Parameters
        ----------
        text : str
            Document text read from plain-text file.

        Yields
        ------
        (callable, object)
            Pre-processor, output from pre-processor (based on `text`).
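
        Examples
        --------
        A debugging sketch, assuming the default pipeline (no `input` is needed; the
        dictionary simply stays uninitialized):

        .. sourcecode:: pycon

            >>> from gensim.corpora.textcorpus import TextCorpus
            >>>
            >>> corpus = TextCorpus()
            >>> for preprocessor, result in corpus.step_through_preprocess(u'The Quick    Brown Fox'):
            ...     print(preprocessor, result)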

        """
        for character_filter in self.character_filters:
            text = character_filter(text)
            yield (character_filter, text)

        tokens = self.tokenizer(text)
        yield (self.tokenizer, tokens)

        for token_filter in self.token_filters:
            tokens = token_filter(tokens)
            yield (token_filter, tokens)

    def get_texts(self):
        """Generate documents from the corpus.

        Yields
        ------
        list of str
            Document as a sequence of tokens (+ lineno if self.metadata).

        """
        lines = self.getstream()
        if self.metadata:
            for lineno, line in enumerate(lines):
                yield self.preprocess_text(line), (lineno,)
        else:
            for line in lines:
                yield self.preprocess_text(line)

    def sample_texts(self, n, seed=None, length=None):
        """Generate `n` random documents from the corpus without replacement.

        Parameters
        ----------
        n : int
            Number of documents we want to sample.
        seed : int, optional
            If specified, use it as a seed for the local random generator.
        length : int, optional
            Value that will be used as the corpus length (because calculating the length of the corpus can be a
            costly operation). If not specified - will call `__len__`.

        Raises
        ------
        ValueError
            If `n` is less than zero or greater than the corpus size.

        Notes
        -----
        Given the number of remaining documents in the corpus, we need to choose `n` of them.
        The probability of choosing the current element is `n` / remaining. If it is chosen, we decrease
        `n` and move on to the next element.

        Yields
        ------
        list of str
            Sampled document as a sequence of tokens.

        """
        random_generator = random if seed is None else random.Random(seed)
        if length is None:
            length = len(self)

        if not n <= length:
            raise ValueError("n {0:d} is larger than the length of corpus {1:d}.".format(n, length))
        if not 0 <= n:
            raise ValueError("Negative sample size n {0:d}.".format(n))

        i = 0
        for i, sample in enumerate(self.getstream()):
            if i == length:
                break

            remaining_in_corpus = length - i
            chance = random_generator.randint(1, remaining_in_corpus)
            if chance <= n:
                n -= 1
                if self.metadata:
                    yield self.preprocess_text(sample[0]), sample[1]
                else:
                    yield self.preprocess_text(sample)

        if n != 0:
            # This means that length was set to be greater than the number of items in the corpus
            # and we were not able to sample enough documents before the stream ended.
            raise ValueError("length {0:d} greater than number of documents in corpus {1:d}".format(length, i + 1))

    def __len__(self):
        """Get the length of the corpus.

        Warnings
        --------
        If self.length is None - the whole corpus will be read through
        :meth:`~gensim.corpora.textcorpus.TextCorpus.getstream` to calculate this attribute.

        Returns
        -------
        int
            Length of corpus.

        """
        if self.length is None:
            # cache the corpus length
            self.length = sum(1 for _ in self.getstream())
        return self.length


class TextDirectoryCorpus(TextCorpus):
    """Read documents recursively from a directory.
    Each file (or each line, depending on `lines_are_documents`) is interpreted as a plain text document.
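
    Examples
    --------
    A sketch of typical usage; the directory name and pattern are hypothetical:

    .. sourcecode:: pycon

        >>> from gensim.corpora.textcorpus import TextDirectoryCorpus
        >>>
        >>> corpus = TextDirectoryCorpus('my_docs/', pattern='.*[.]txt$')  # one document per matching file
        >>> for bow in corpus:
        ...     pass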

    """

    def __init__(self, input, dictionary=None, metadata=False, min_depth=0, max_depth=None,
                 pattern=None, exclude_pattern=None, lines_are_documents=False, **kwargs):
        """

        Parameters
        ----------
        input : str
            Path to input file/folder.
        dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            If a dictionary is provided, it will not be updated with the given corpus on initialization.
            If None - a new dictionary will be built for the given corpus.
            If `input` is None, the dictionary will remain uninitialized.
        metadata : bool, optional
            If True - yield metadata with each document.
        min_depth : int, optional
            Minimum depth in the directory tree at which to begin searching for files.
        max_depth : int, optional
            Max depth in the directory tree at which files will no longer be considered.
            If None - not limited.
        pattern : str, optional
            Regex to use for file name inclusion; all those files *not* matching this pattern will be ignored.
        exclude_pattern : str, optional
            Regex to use for file name exclusion; all files matching this pattern will be ignored.
        lines_are_documents : bool, optional
            If True - each line is considered a document, otherwise - each file is one document.
        kwargs : keyword arguments passed through to the `TextCorpus` constructor.
            See :meth:`gensim.corpora.textcorpus.TextCorpus.__init__` docstring for more details on these.

        """
        self._min_depth = min_depth
        self._max_depth = sys.maxsize if max_depth is None else max_depth
        self.pattern = pattern
        self.exclude_pattern = exclude_pattern
        self.lines_are_documents = lines_are_documents
        super(TextDirectoryCorpus, self).__init__(input, dictionary, metadata, **kwargs)

    @property
    def lines_are_documents(self):
        return self._lines_are_documents

    @lines_are_documents.setter
    def lines_are_documents(self, lines_are_documents):
        self._lines_are_documents = lines_are_documents
        self.length = None

    @property
    def pattern(self):
        return self._pattern

    @pattern.setter
    def pattern(self, pattern):
        self._pattern = None if pattern is None else re.compile(pattern)
        self.length = None

    @property
    def exclude_pattern(self):
        return self._exclude_pattern

    @exclude_pattern.setter
    def exclude_pattern(self, pattern):
        self._exclude_pattern = None if pattern is None else re.compile(pattern)
        self.length = None

    @property
    def min_depth(self):
        return self._min_depth

    @min_depth.setter
    def min_depth(self, min_depth):
        self._min_depth = min_depth
        self.length = None

    @property
    def max_depth(self):
        return self._max_depth

    @max_depth.setter
    def max_depth(self, max_depth):
        self._max_depth = max_depth
        self.length = None

    def iter_filepaths(self):
        """Generate (lazily) paths to each file in the directory structure within the specified range of depths.
        If a filename pattern to match was given, further filter to only those filenames that match.

        Yields
        ------
        str
            Path to file.
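
        Examples
        --------
        A sketch, reusing the hypothetical directory from the class example. Passing a
        pre-built dictionary skips the initial dictionary-building pass over the corpus:

        .. sourcecode:: pycon

            >>> from gensim.corpora.dictionary import Dictionary
            >>> from gensim.corpora.textcorpus import TextDirectoryCorpus
            >>>
            >>> corpus = TextDirectoryCorpus('my_docs/', dictionary=Dictionary())
            >>> filepaths = list(corpus.iter_filepaths())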

        """
        for depth, dirpath, dirnames, filenames in walk(self.input):
            if self.min_depth <= depth <= self.max_depth:
                if self.pattern is not None:
                    filenames = (n for n in filenames if self.pattern.match(n) is not None)
                if self.exclude_pattern is not None:
                    filenames = (n for n in filenames if self.exclude_pattern.match(n) is None)

                for name in filenames:
                    yield os.path.join(dirpath, name)

    def getstream(self):
        """Generate documents from the underlying plain text collection (of one or more files).

        Yields
        ------
        str
            One line of a file if `lines_are_documents` is True, otherwise - the full content of one file.

        """
        num_texts = 0
        for path in self.iter_filepaths():
            with open(path, 'rt') as f:
                if self.lines_are_documents:
                    for line in f:
                        yield line.strip()
                        num_texts += 1
                else:
                    yield f.read().strip()
                    num_texts += 1

        self.length = num_texts

    def __len__(self):
        """Get the length of the corpus.

        Returns
        -------
        int
            Length of corpus.

        """
        if self.length is None:
            self._cache_corpus_length()
        return self.length

    def _cache_corpus_length(self):
        """Calculate the length of the corpus and cache it in `self.length`."""
        if not self.lines_are_documents:
            self.length = sum(1 for _ in self.iter_filepaths())
        else:
            self.length = sum(1 for _ in self.getstream())


def walk(top, topdown=True, onerror=None, followlinks=False, depth=0):
    """Generate the file names in a directory tree by walking the tree either top-down or bottom-up.
    For each directory in the tree rooted at directory `top` (including `top` itself), it yields a 4-tuple
    (depth, dirpath, dirnames, filenames).

    Parameters
    ----------
    top : str
        Root directory.
    topdown : bool, optional
        If True - each directory is yielded before its subdirectories, and you can modify `dirnames` in-place.
    onerror : function, optional
        Function that will be called with one argument, an OSError instance.
        It can report the error to continue with the walk, or raise the exception to abort the walk.
        Note that the filename is available as the filename attribute of the exception object.
    followlinks : bool, optional
        If True - visit directories pointed to by symlinks, on systems that support them.
    depth : int, optional
        Current depth in the file tree; don't pass it manually (it is used as an accumulator for the recursion).

    Notes
    -----
    This is a mostly copied version of `os.walk` from the Python 2 source code.
    The only difference is that it returns the depth in the directory tree structure
    at which each yield is taking place.

    Yields
    ------
    (int, str, list of str, list of str)
        Depth, current path, visited directories, visited non-directories.

    See Also
    --------
    `os.walk documentation <https://docs.python.org/2/library/os.html#os.walk>`_
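
    Examples
    --------
    A sketch; any existing directory can stand in for `/tmp`:

    .. sourcecode:: pycon

        >>> from gensim.corpora.textcorpus import walk
        >>>
        >>> for depth, dirpath, dirnames, filenames in walk('/tmp'):
        ...     print(depth, dirpath)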

    """
    islink, join, isdir = os.path.islink, os.path.join, os.path.isdir

    try:
        # Read all entries of `top` in one go; may raise OSError, handled below.
        names = os.listdir(top)
    except OSError as err:
        if onerror is not None:
            onerror(err)
        return

    dirs, nondirs = [], []

    # O(n) where n = number of entries in the directory
    for name in names:
        if isdir(join(top, name)):
            dirs.append(name)
        else:
            nondirs.append(name)

    if topdown:
        yield depth, top, dirs, nondirs

    # Again O(n), where n = number of directories in the directory
    for name in dirs:
        new_path = join(top, name)
        if followlinks or not islink(new_path):

            # Generator, so besides the recursive `walk()` call, there is no additional cost here.
            for x in walk(new_path, topdown, onerror, followlinks, depth + 1):
                yield x
    if not topdown:
        yield depth, top, dirs, nondirs