1#!/usr/bin/env python 2# -*- coding: utf-8 -*- 3# 4# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 5 6"""Corpus in `Mallet format <http://mallet.cs.umass.edu/import.php>`_.""" 7 8from __future__ import with_statement 9 10import logging 11 12from gensim import utils 13from gensim.corpora import LowCorpus 14 15 16logger = logging.getLogger(__name__) 17 18 19class MalletCorpus(LowCorpus): 20 """Corpus handles input in `Mallet format <http://mallet.cs.umass.edu/import.php>`_. 21 22 **Format description** 23 24 One file, one instance per line, assume the data is in the following format :: 25 26 [URL] [language] [text of the page...] 27 28 Or, more generally, :: 29 30 [document #1 id] [label] [text of the document...] 31 [document #2 id] [label] [text of the document...] 32 ... 33 [document #N id] [label] [text of the document...] 34 35 Note that language/label is *not* considered in Gensim, used `__unknown__` as default value. 36 37 Examples 38 -------- 39 .. sourcecode:: pycon 40 41 >>> from gensim.test.utils import get_tmpfile, common_texts 42 >>> from gensim.corpora import MalletCorpus 43 >>> from gensim.corpora import Dictionary 44 >>> 45 >>> # Prepare needed data 46 >>> dictionary = Dictionary(common_texts) 47 >>> corpus = [dictionary.doc2bow(doc) for doc in common_texts] 48 >>> 49 >>> # Write corpus in Mallet format to disk 50 >>> output_fname = get_tmpfile("corpus.mallet") 51 >>> MalletCorpus.serialize(output_fname, corpus, dictionary) 52 >>> 53 >>> # Read corpus 54 >>> loaded_corpus = MalletCorpus(output_fname) 55 56 """ 57 def __init__(self, fname, id2word=None, metadata=False): 58 """ 59 60 Parameters 61 ---------- 62 fname : str 63 Path to file in Mallet format. 64 id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional 65 Mapping between word_ids (integers) and words (strings). 66 If not provided, the mapping is constructed directly from `fname`. 67 metadata : bool, optional 68 If True, return additional information ("document id" and "lang" when you call 69 :meth:`~gensim.corpora.malletcorpus.MalletCorpus.line2doc`, 70 :meth:`~gensim.corpora.malletcorpus.MalletCorpus.__iter__` or 71 :meth:`~gensim.corpora.malletcorpus.MalletCorpus.docbyoffset` 72 73 """ 74 self.metadata = metadata 75 LowCorpus.__init__(self, fname, id2word) 76 77 def _calculate_num_docs(self): 78 """Get number of documents. 79 80 Returns 81 ------- 82 int 83 Number of documents in file. 84 85 """ 86 with utils.open(self.fname, 'rb') as fin: 87 result = sum(1 for _ in fin) 88 return result 89 90 def __iter__(self): 91 """Iterate over the corpus. 92 93 Yields 94 ------ 95 list of (int, int) 96 Document in BoW format (+"document_id" and "lang" if metadata=True). 97 98 """ 99 with utils.open(self.fname, 'rb') as f: 100 for line in f: 101 yield self.line2doc(line) 102 103 def line2doc(self, line): 104 """Covert line into document in BoW format. 105 106 Parameters 107 ---------- 108 line : str 109 Line from input file. 110 111 Returns 112 ------- 113 list of (int, int) 114 Document in BoW format (+"document_id" and "lang" if metadata=True). 115 116 Examples 117 -------- 118 .. sourcecode:: pycon 119 120 >>> from gensim.test.utils import datapath 121 >>> from gensim.corpora import MalletCorpus 122 >>> 123 >>> corpus = MalletCorpus(datapath("testcorpus.mallet")) 124 >>> corpus.line2doc("en computer human interface") 125 [(3, 1), (4, 1)] 126 127 """ 128 split_line = utils.to_unicode(line).strip().split(None, 2) 129 docid, doclang = split_line[0], split_line[1] 130 words = split_line[2] if len(split_line) >= 3 else '' 131 132 doc = super(MalletCorpus, self).line2doc(words) 133 134 if self.metadata: 135 return doc, (docid, doclang) 136 else: 137 return doc 138 139 @staticmethod 140 def save_corpus(fname, corpus, id2word=None, metadata=False): 141 """Save a corpus in the Mallet format. 142 143 Warnings 144 -------- 145 This function is automatically called by :meth:`gensim.corpora.malletcorpus.MalletCorpus.serialize`, 146 don't call it directly, call :meth:`gensim.corpora.lowcorpus.malletcorpus.MalletCorpus.serialize` instead. 147 148 Parameters 149 ---------- 150 fname : str 151 Path to output file. 152 corpus : iterable of iterable of (int, int) 153 Corpus in BoW format. 154 id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional 155 Mapping between word_ids (integers) and words (strings). 156 If not provided, the mapping is constructed directly from `corpus`. 157 metadata : bool, optional 158 If True - ???? 159 160 Return 161 ------ 162 list of int 163 List of offsets in resulting file for each document (in bytes), 164 can be used for :meth:`~gensim.corpora.malletcorpus.Malletcorpus.docbyoffset`. 165 166 Notes 167 ----- 168 The document id will be generated by enumerating the corpus. 169 That is, it will range between 0 and number of documents in the corpus. 170 171 Since Mallet has a language field in the format, this defaults to the string '__unknown__'. 172 If the language needs to be saved, post-processing will be required. 173 174 """ 175 if id2word is None: 176 logger.info("no word id mapping provided; initializing from corpus") 177 id2word = utils.dict_from_corpus(corpus) 178 179 logger.info("storing corpus in Mallet format into %s", fname) 180 181 truncated = 0 182 offsets = [] 183 with utils.open(fname, 'wb') as fout: 184 for doc_id, doc in enumerate(corpus): 185 if metadata: 186 doc_id, doc_lang = doc[1] 187 doc = doc[0] 188 else: 189 doc_lang = '__unknown__' 190 191 words = [] 192 for wordid, value in doc: 193 if abs(int(value) - value) > 1e-6: 194 truncated += 1 195 words.extend([utils.to_unicode(id2word[wordid])] * int(value)) 196 offsets.append(fout.tell()) 197 fout.write(utils.to_utf8('%s %s %s\n' % (doc_id, doc_lang, ' '.join(words)))) 198 199 if truncated: 200 logger.warning( 201 "Mallet format can only save vectors with integer elements; " 202 "%i float entries were truncated to integer value", truncated 203 ) 204 205 return offsets 206 207 def docbyoffset(self, offset): 208 """Get the document stored in file by `offset` position. 209 210 Parameters 211 ---------- 212 offset : int 213 Offset (in bytes) to begin of document. 214 215 Returns 216 ------- 217 list of (int, int) 218 Document in BoW format (+"document_id" and "lang" if metadata=True). 219 220 Examples 221 -------- 222 .. sourcecode:: pycon 223 224 >>> from gensim.test.utils import datapath 225 >>> from gensim.corpora import MalletCorpus 226 >>> 227 >>> data = MalletCorpus(datapath("testcorpus.mallet")) 228 >>> data.docbyoffset(1) # end of first line 229 [(3, 1), (4, 1)] 230 >>> data.docbyoffset(4) # start of second line 231 [(4, 1)] 232 233 """ 234 with utils.open(self.fname, 'rb') as f: 235 f.seek(offset) 236 return self.line2doc(f.readline()) 237