# Copyright (C) 2018 Radim Rehurek <radimrehurek@seznam.cz>
# cython: embedsignature=True

"""Reader for corpus in the Matrix Market format."""
import logging

cimport cython
from libc.stdio cimport sscanf

from gensim import utils

logger = logging.getLogger(__name__)


cdef class MmReader():
    """Matrix market file reader (fast Cython version), used internally in :class:`~gensim.corpora.mmcorpus.MmCorpus`.

    Wrap a term-document matrix on disk (in matrix-market format), and present it
    as an object which supports iteration over the rows (~documents).

    Attributes
    ----------
    num_docs : int
        Number of documents in the market matrix file.
    num_terms : int
        Number of terms.
    num_nnz : int
        Number of non-zero terms.

    Notes
    -----
    Note that the file is read into memory one document at a time, not the whole matrix at once
    (unlike e.g. `scipy.io.mmread` and other implementations).
    This allows us to process corpora which are larger than the available RAM.

    """
    cdef public input
    cdef public bint transposed
    cdef public long long num_docs, num_terms, num_nnz

    def __init__(self, input, transposed=True):
        """

        Parameters
        ----------
        input : {str, file-like object}
            Path to the input file in MM format or a file-like object that supports `seek()`
            (e.g. smart_open objects).

        transposed : bool, optional
            Do lines represent `doc_id, term_id, value`, instead of `term_id, doc_id, value`?

        Raises
        ------
        ValueError
            If the first line of the input is not a
            ``%%matrixmarket matrix coordinate real general`` banner.

        """
        logger.info("initializing cython corpus reader from %s", input)
        self.input, self.transposed = input, transposed
        with utils.open_file(self.input) as lines:
            try:
                header = utils.to_unicode(next(lines)).strip()
                if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
                    raise ValueError(
                        "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
                        (self.input, header)
                    )
            except StopIteration:
                # Empty input: there is no banner to validate. The loop below then
                # sees no lines either, so all three counts stay at zero.
                pass

            self.num_docs = self.num_terms = self.num_nnz = 0
            for line in lines:
                line = utils.to_unicode(line)
                if not line.startswith('%'):
                    # The first non-comment line holds the three matrix dimensions.
                    self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split())
                    if not self.transposed:
                        self.num_docs, self.num_terms = self.num_terms, self.num_docs
                    break

        logger.info(
            "accepted corpus with %i documents, %i features, %i non-zero entries",
            self.num_docs, self.num_terms, self.num_nnz
        )

    def __len__(self):
        """Get the corpus size: total number of documents."""
        return self.num_docs

    def __str__(self):
        return ("MmCorpus(%i documents, %i features, %i non-zero entries)" %
                (self.num_docs, self.num_terms, self.num_nnz))

    def skip_headers(self, input_file):
        """Skip file headers that appear before the first document.

        Lines starting with ``%`` are discarded. The first line that does not start
        with ``%`` is consumed as well, so iteration of `input_file` resumes on the
        line after it.

        Parameters
        ----------
        input_file : iterable of bytes
            Iterable taken from file in MM format.

        """
        for line in input_file:
            if line.startswith(b'%'):
                continue
            break

    def __iter__(self):
        """Iterate through all documents in the corpus.

        Notes
        ------
        Note that the total number of vectors returned is always equal to the number of rows specified
        in the header: empty documents are inserted and yielded where appropriate, even if they are not explicitly
        stored in the Matrix Market file.

        Yields
        ------
        (int, list of (int, number))
            Document id and document in sparse bag-of-words format.

        Raises
        ------
        ValueError
            If a data line cannot be parsed as two integers followed by a number.

        """
        cdef long long docid, termid, previd
        cdef double val = 0

        with utils.file_or_filename(self.input) as lines:
            self.skip_headers(lines)

            previd = -1
            for line in lines:

                if (sscanf(line, "%lld %lld %lg", &docid, &termid, &val) != 3):
                    raise ValueError("unable to parse line: {}".format(line))

                if not self.transposed:
                    termid, docid = docid, termid

                # -1 because matrix market indexes are 1-based => convert to 0-based
                docid -= 1
                termid -= 1

                assert previd <= docid, "matrix columns must come in ascending order"
                if docid != previd:
                    # change of document: return the document read so far (its id is prevId)
                    if previd >= 0:
                        yield previd, document  # noqa:F821

                    # return implicit (empty) documents between previous id and new id
                    # too, to keep consistent document numbering and corpus length
                    for previd in range(previd + 1, docid):
                        yield previd, []

                    # from now on start adding fields to a new document, with a new id
                    previd = docid
                    document = []

                document.append((termid, val,))  # add another field to the current document

        # handle the last document, as a special case
        if previd >= 0:
            yield previd, document

        # return empty documents between the last explicit document and the number
        # of documents as specified in the header
        for previd in range(previd + 1, self.num_docs):
            yield previd, []

    def docbyoffset(self, offset):
        """Get the document at file offset `offset` (in bytes).

        Parameters
        ----------
        offset : int
            File offset, in bytes, of the desired document. The special value -1
            denotes an empty document.

        Returns
        ------
        list of (int, float)
            Document in sparse bag-of-words format.

        Raises
        ------
        ValueError
            If a data line cannot be parsed as two integers followed by a number.

        """
        # empty documents are not stored explicitly in MM format, so the index marks
        # them with a special offset, -1.
        cdef long long docid, termid, previd
        cdef double val

        if offset == -1:
            return []
        if isinstance(self.input, str):
            fin, close_fin = utils.open(self.input, 'rb'), True
        else:
            # A caller-supplied file object stays open: this method did not open it.
            fin, close_fin = self.input, False

        previd, document = -1, []
        try:
            fin.seek(offset)  # works for gzip/bz2 input, too
            for line in fin:
                if (sscanf(line, "%lld %lld %lg", &docid, &termid, &val) != 3):
                    raise ValueError("unable to parse line: {}".format(line))

                if not self.transposed:
                    termid, docid = docid, termid

                # -1 because matrix market indexes are 1-based => convert to 0-based
                docid -= 1
                termid -= 1

                assert previd <= docid, "matrix columns must come in ascending order"
                if docid != previd:
                    # A second distinct document id means the requested document is complete.
                    if previd >= 0:
                        break
                    previd = docid

                document.append((termid, val,))  # add another field to the current document
        finally:
            # Close the handle opened here even when seeking or parsing fails.
            if close_fin:
                fin.close()
        return document