1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3#
4# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
5
6"""Corpus in `Mallet format <http://mallet.cs.umass.edu/import.php>`_."""
7
8from __future__ import with_statement
9
10import logging
11
12from gensim import utils
13from gensim.corpora import LowCorpus
14
15
16logger = logging.getLogger(__name__)
17
18
19class MalletCorpus(LowCorpus):
20    """Corpus handles input in `Mallet format <http://mallet.cs.umass.edu/import.php>`_.
21
22    **Format description**
23
24    One file, one instance per line, assume the data is in the following format ::
25
26        [URL] [language] [text of the page...]
27
28    Or, more generally, ::
29
30        [document #1 id] [label] [text of the document...]
31        [document #2 id] [label] [text of the document...]
32        ...
33        [document #N id] [label] [text of the document...]
34
35    Note that language/label is *not* considered in Gensim, used `__unknown__` as default value.
36
37    Examples
38    --------
39    .. sourcecode:: pycon
40
41        >>> from gensim.test.utils import get_tmpfile, common_texts
42        >>> from gensim.corpora import MalletCorpus
43        >>> from gensim.corpora import Dictionary
44        >>>
45        >>> # Prepare needed data
46        >>> dictionary = Dictionary(common_texts)
47        >>> corpus = [dictionary.doc2bow(doc) for doc in common_texts]
48        >>>
49        >>> # Write corpus in Mallet format to disk
50        >>> output_fname = get_tmpfile("corpus.mallet")
51        >>> MalletCorpus.serialize(output_fname, corpus, dictionary)
52        >>>
53        >>> # Read corpus
54        >>> loaded_corpus = MalletCorpus(output_fname)
55
56    """
57    def __init__(self, fname, id2word=None, metadata=False):
58        """
59
60        Parameters
61        ----------
62        fname : str
63            Path to file in Mallet format.
64        id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
65            Mapping between word_ids (integers) and words (strings).
66            If not provided, the mapping is constructed directly from `fname`.
67        metadata : bool, optional
68            If True, return additional information ("document id" and "lang" when you call
69            :meth:`~gensim.corpora.malletcorpus.MalletCorpus.line2doc`,
70            :meth:`~gensim.corpora.malletcorpus.MalletCorpus.__iter__` or
71            :meth:`~gensim.corpora.malletcorpus.MalletCorpus.docbyoffset`
72
73       """
74        self.metadata = metadata
75        LowCorpus.__init__(self, fname, id2word)
76
77    def _calculate_num_docs(self):
78        """Get number of documents.
79
80        Returns
81        -------
82        int
83            Number of documents in file.
84
85        """
86        with utils.open(self.fname, 'rb') as fin:
87            result = sum(1 for _ in fin)
88        return result
89
90    def __iter__(self):
91        """Iterate over the corpus.
92
93        Yields
94        ------
95        list of (int, int)
96            Document in BoW format (+"document_id" and "lang" if metadata=True).
97
98        """
99        with utils.open(self.fname, 'rb') as f:
100            for line in f:
101                yield self.line2doc(line)
102
103    def line2doc(self, line):
104        """Covert line into document in BoW format.
105
106        Parameters
107        ----------
108        line : str
109            Line from input file.
110
111        Returns
112        -------
113        list of (int, int)
114            Document in BoW format (+"document_id" and "lang" if metadata=True).
115
116        Examples
117        --------
118        .. sourcecode:: pycon
119
120            >>> from gensim.test.utils import datapath
121            >>> from gensim.corpora import MalletCorpus
122            >>>
123            >>> corpus = MalletCorpus(datapath("testcorpus.mallet"))
124            >>> corpus.line2doc("en computer human interface")
125            [(3, 1), (4, 1)]
126
127        """
128        split_line = utils.to_unicode(line).strip().split(None, 2)
129        docid, doclang = split_line[0], split_line[1]
130        words = split_line[2] if len(split_line) >= 3 else ''
131
132        doc = super(MalletCorpus, self).line2doc(words)
133
134        if self.metadata:
135            return doc, (docid, doclang)
136        else:
137            return doc
138
139    @staticmethod
140    def save_corpus(fname, corpus, id2word=None, metadata=False):
141        """Save a corpus in the Mallet format.
142
143        Warnings
144        --------
145        This function is automatically called by :meth:`gensim.corpora.malletcorpus.MalletCorpus.serialize`,
146        don't call it directly, call :meth:`gensim.corpora.lowcorpus.malletcorpus.MalletCorpus.serialize` instead.
147
148        Parameters
149        ----------
150        fname : str
151            Path to output file.
152        corpus : iterable of iterable of (int, int)
153            Corpus in BoW format.
154        id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
155            Mapping between word_ids (integers) and words (strings).
156            If not provided, the mapping is constructed directly from `corpus`.
157        metadata : bool, optional
158            If True - ????
159
160        Return
161        ------
162        list of int
163            List of offsets in resulting file for each document (in bytes),
164            can be used for :meth:`~gensim.corpora.malletcorpus.Malletcorpus.docbyoffset`.
165
166        Notes
167        -----
168        The document id will be generated by enumerating the corpus.
169        That is, it will range between 0 and number of documents in the corpus.
170
171        Since Mallet has a language field in the format, this defaults to the string '__unknown__'.
172        If the language needs to be saved, post-processing will be required.
173
174        """
175        if id2word is None:
176            logger.info("no word id mapping provided; initializing from corpus")
177            id2word = utils.dict_from_corpus(corpus)
178
179        logger.info("storing corpus in Mallet format into %s", fname)
180
181        truncated = 0
182        offsets = []
183        with utils.open(fname, 'wb') as fout:
184            for doc_id, doc in enumerate(corpus):
185                if metadata:
186                    doc_id, doc_lang = doc[1]
187                    doc = doc[0]
188                else:
189                    doc_lang = '__unknown__'
190
191                words = []
192                for wordid, value in doc:
193                    if abs(int(value) - value) > 1e-6:
194                        truncated += 1
195                    words.extend([utils.to_unicode(id2word[wordid])] * int(value))
196                offsets.append(fout.tell())
197                fout.write(utils.to_utf8('%s %s %s\n' % (doc_id, doc_lang, ' '.join(words))))
198
199        if truncated:
200            logger.warning(
201                "Mallet format can only save vectors with integer elements; "
202                "%i float entries were truncated to integer value", truncated
203            )
204
205        return offsets
206
207    def docbyoffset(self, offset):
208        """Get the document stored in file by `offset` position.
209
210        Parameters
211        ----------
212        offset : int
213            Offset (in bytes) to begin of document.
214
215        Returns
216        -------
217        list of (int, int)
218            Document in BoW format (+"document_id" and "lang" if metadata=True).
219
220        Examples
221        --------
222        .. sourcecode:: pycon
223
224            >>> from gensim.test.utils import datapath
225            >>> from gensim.corpora import MalletCorpus
226            >>>
227            >>> data = MalletCorpus(datapath("testcorpus.mallet"))
228            >>> data.docbyoffset(1)  # end of first line
229            [(3, 1), (4, 1)]
230            >>> data.docbyoffset(4)  # start of second line
231            [(4, 1)]
232
233        """
234        with utils.open(self.fname, 'rb') as f:
235            f.seek(offset)
236            return self.line2doc(f.readline())
237