1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3#
4# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
5# Copyright (C) 2012 Lars Buitinck <larsmans@gmail.com>
6# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
7
8
9"""
10USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE]
11
12Convert articles from a Wikipedia dump to (sparse) vectors. The input is a
13bz2-compressed dump of Wikipedia articles, in XML format.
14
15This actually creates three files:
16
17* `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids
18* `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in
19  Matrix Matrix format
20* `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation
21* `OUTPUT_PREFIX.tfidf_model`: TF-IDF model dump
22
23The output Matrix Market files can then be compressed (e.g., by bzip2) to save
24disk space; gensim's corpus iterators can work with compressed input, too.
25
26`VOCABULARY_SIZE` controls how many of the most frequent words to keep (after
27removing tokens that appear in more than 10%% of all documents). Defaults to
28100,000.
29
30If you have the `pattern` package installed, this script will use a fancy
31lemmatization to get a lemma of each token (instead of plain alphabetic
32tokenizer). The package is available at https://github.com/clips/pattern .
33
34Example:
35  python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki
36"""
37
38
39import logging
40import os.path
41import sys
42
43from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus
44from gensim.models import TfidfModel
45
46
47# Wiki is first scanned for all distinct word types (~7M). The types that
48# appear in more than 10% of articles are removed and from the rest, the
49# DEFAULT_DICT_SIZE most frequent types are kept.
50DEFAULT_DICT_SIZE = 100000
51
52
53if __name__ == '__main__':
54    program = os.path.basename(sys.argv[0])
55    logger = logging.getLogger(program)
56
57    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
58    logging.root.setLevel(level=logging.INFO)
59    logger.info("running %s", ' '.join(sys.argv))
60
61    # check and process input arguments
62    if len(sys.argv) < 3:
63        print(globals()['__doc__'] % locals())
64        sys.exit(1)
65    inp, outp = sys.argv[1:3]
66
67    if not os.path.isdir(os.path.dirname(outp)):
68        raise SystemExit("Error: The output directory does not exist. Create the directory and try again.")
69
70    if len(sys.argv) > 3:
71        keep_words = int(sys.argv[3])
72    else:
73        keep_words = DEFAULT_DICT_SIZE
74    online = 'online' in program
75    lemmatize = 'lemma' in program
76    debug = 'nodebug' not in program
77
78    if online:
79        dictionary = HashDictionary(id_range=keep_words, debug=debug)
80        dictionary.allow_update = True  # start collecting document frequencies
81        wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
82        # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
83        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
84        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
85        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
86        dictionary.save_as_text(outp + '_wordids.txt.bz2')
87        wiki.save(outp + '_corpus.pkl.bz2')
88        dictionary.allow_update = False
89    else:
90        wiki = WikiCorpus(inp, lemmatize=lemmatize)  # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
91        # only keep the most frequent words (out of total ~8.2m unique tokens)
92        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
93        # save dictionary and bag-of-words (term-document frequency matrix)
94        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)  # another ~9h
95        wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
96        # load back the id->word mapping directly from file
97        # this seems to save more memory, compared to keeping the wiki.dictionary object from above
98        dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
99    del wiki
100
101    # initialize corpus reader and word->id mapping
102    mm = MmCorpus(outp + '_bow.mm')
103
104    # build tfidf, ~50min
105    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
106    tfidf.save(outp + '.tfidf_model')
107
108    # save tfidf vectors in matrix market format
109    # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
110    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)
111
112    logger.info("finished running %s", program)
113