#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Copyright (C) 2012 Lars Buitinck <larsmans@gmail.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""
USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE]

Convert articles from a Wikipedia dump to (sparse) vectors. The input is a
bz2-compressed dump of Wikipedia articles, in XML format.

This actually creates four files:

* `OUTPUT_PREFIX_wordids.txt.bz2`: mapping between words and their integer ids
* `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in
  Matrix Market format
* `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation, in Matrix Market format
* `OUTPUT_PREFIX.tfidf_model`: TF-IDF model dump

The output Matrix Market files can then be compressed (e.g., by bzip2) to save
disk space; gensim's corpus iterators can work with compressed input, too.

`VOCABULARY_SIZE` controls how many of the most frequent words to keep (after
removing tokens that appear in more than 10%% of all documents). Defaults to
100,000.

If you have the `pattern` package installed, this script will use lemmatization
to obtain the lemma of each token (instead of the plain alphabetic tokenizer).
The package is available at https://github.com/clips/pattern .

Example:
    python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki
"""


import logging
import os.path
import sys

from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus
from gensim.models import TfidfModel


# Wiki is first scanned for all distinct word types (~7M). The types that
# appear in more than 10% of articles are removed and, from the rest, the
# DEFAULT_DICT_SIZE most frequent types are kept.
DEFAULT_DICT_SIZE = 100000


if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]

    if not os.path.isdir(os.path.dirname(outp)):
        raise SystemExit("Error: The output directory does not exist. Create the directory and try again.")

    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE
    online = 'online' in program
    lemmatize = 'lemma' in program
    debug = 'nodebug' not in program

    if online:
        dictionary = HashDictionary(id_range=keep_words, debug=debug)
        dictionary.allow_update = True  # start collecting document frequencies
        wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
        # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        dictionary.save_as_text(outp + '_wordids.txt.bz2')
        wiki.save(outp + '_corpus.pkl.bz2')
        dictionary.allow_update = False
    else:
        wiki = WikiCorpus(inp, lemmatize=lemmatize)  # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)  # another ~9h
        wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
        # load back the id->word mapping directly from file
        # this seems to save more memory, compared to keeping the wiki.dictionary object from above
        dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    del wiki

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s", program)
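

# A minimal usage sketch (commented out): loading the generated files back with
# gensim for downstream processing. The bare `wiki` prefix below is an
# assumption, taken from the OUTPUT_PREFIX in the docstring example above;
# substitute your own prefix.
#
#     from gensim.corpora import Dictionary, MmCorpus
#     from gensim.models import TfidfModel
#
#     id2word = Dictionary.load_from_text('wiki_wordids.txt.bz2')  # word <-> id mapping
#     bow = MmCorpus('wiki_bow.mm')                                # bag-of-words corpus
#     tfidf = TfidfModel.load('wiki.tfidf_model')                  # trained TF-IDF model
#     tfidf_vectors = MmCorpus('wiki_tfidf.mm')                    # precomputed TF-IDF vectors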