1# Copyright (C) 2018 Radim Rehurek <radimrehurek@seznam.cz>
2# cython: embedsignature=True
3
4"""Reader for corpus in the Matrix Market format."""
5import logging
6
7cimport cython
8from libc.stdio cimport sscanf
9
10from gensim import utils
11
# Module-level logger, named after this module via `__name__`.
logger = logging.getLogger(__name__)
13
14
cdef class MmReader():
    """Matrix market file reader (fast Cython version), used internally in :class:`~gensim.corpora.mmcorpus.MmCorpus`.

    Wrap a term-document matrix on disk (in matrix-market format), and present it
    as an object which supports iteration over the rows (~documents).

    Attributes
    ----------
    input : {str, file-like object}
        The path or file-like object given to the constructor.
    transposed : bool
        Whether lines represent `doc_id, term_id, value` (True) or `term_id, doc_id, value` (False).
    num_docs : int
        Number of documents in the market matrix file.
    num_terms : int
        Number of terms.
    num_nnz : int
        Number of non-zero terms.

    Notes
    -----
    Note that the file is read into memory one document at a time, not the whole matrix at once
    (unlike e.g. `scipy.io.mmread` and other implementations).
    This allows us to process corpora which are larger than the available RAM.

    """
    cdef public input
    cdef public bint transposed
    cdef public long long num_docs, num_terms, num_nnz

    def __init__(self, input, transposed=True):
        """

        Parameters
        ----------
        input : {str, file-like object}
            Path to the input file in MM format or a file-like object that supports `seek()`
            (e.g. smart_open objects).

        transposed : bool, optional
            Do lines represent `doc_id, term_id, value`, instead of `term_id, doc_id, value`?

        Raises
        ------
        ValueError
            If the first line of the input is not a `%%MatrixMarket matrix coordinate real general` header.

        """
        logger.info("initializing cython corpus reader from %s", input)
        self.input, self.transposed = input, transposed
        with utils.open_file(self.input) as lines:
            try:
                header = utils.to_unicode(next(lines)).strip()
            except StopIteration:
                # Completely empty input: no header to validate; the counts below stay at zero.
                pass
            else:
                if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
                    raise ValueError(
                        "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
                        (self.input, header)
                    )

            self.num_docs = self.num_terms = self.num_nnz = 0
            for line in lines:
                line = utils.to_unicode(line)
                if not line.startswith('%'):
                    # The first non-comment line holds the three matrix dimensions.
                    self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split())
                    if not self.transposed:
                        self.num_docs, self.num_terms = self.num_terms, self.num_docs
                    break

        logger.info(
            "accepted corpus with %i documents, %i features, %i non-zero entries",
            self.num_docs, self.num_terms, self.num_nnz
        )

    def __len__(self):
        """Get the corpus size: total number of documents."""
        return self.num_docs

    def __str__(self):
        """Get a human-readable summary of the corpus dimensions."""
        return ("MmCorpus(%i documents, %i features, %i non-zero entries)" %
                (self.num_docs, self.num_terms, self.num_nnz))

    def skip_headers(self, input_file):
        """Skip file headers that appear before the first document.

        Consumes lines up to and including the first line that does not start with `%`
        (the first non-comment line), so that the next line read is the first matrix entry.

        Parameters
        ----------
        input_file : iterable of bytes
            Iterable taken from file in MM format.

        """
        for line in input_file:
            if line.startswith(b'%'):
                continue
            break

    def __iter__(self):
        """Iterate through all documents in the corpus.

        Notes
        ------
        Note that the total number of vectors returned is always equal to the number of rows specified
        in the header: empty documents are inserted and yielded where appropriate, even if they are not explicitly
        stored in the Matrix Market file.

        Yields
        ------
        (int, list of (int, number))
            Document id and document in sparse bag-of-words format.

        Raises
        ------
        ValueError
            If a line cannot be parsed as `int int float`.

        """
        cdef long long docid, termid, previd
        cdef double val = 0

        # Bound up front so the name is always defined; it is only yielded once previd >= 0.
        document = []

        with utils.file_or_filename(self.input) as lines:
            self.skip_headers(lines)

            previd = -1
            for line in lines:

                if (sscanf(line, "%lld %lld %lg", &docid, &termid, &val) != 3):
                    raise ValueError("unable to parse line: {}".format(line))

                if not self.transposed:
                    termid, docid = docid, termid

                # -1 because matrix market indexes are 1-based => convert to 0-based
                docid -= 1
                termid -= 1

                assert previd <= docid, "matrix columns must come in ascending order"
                if docid != previd:
                    # change of document: return the document read so far (its id is prevId)
                    if previd >= 0:
                        yield previd, document

                    # return implicit (empty) documents between previous id and new id
                    # too, to keep consistent document numbering and corpus length
                    for previd in range(previd + 1, docid):
                        yield previd, []

                    # from now on start adding fields to a new document, with a new id
                    previd = docid
                    document = []

                document.append((termid, val,))  # add another field to the current document

        # handle the last document, as a special case
        if previd >= 0:
            yield previd, document

        # return empty documents between the last explicit document and the number
        # of documents as specified in the header
        for previd in range(previd + 1, self.num_docs):
            yield previd, []

    def docbyoffset(self, offset):
        """Get the document at file offset `offset` (in bytes).

        Parameters
        ----------
        offset : int
            File offset, in bytes, of the desired document.
            The special value -1 denotes an empty document.

        Returns
        ------
        list of (int, number)
            Document in sparse bag-of-words format.

        Raises
        ------
        ValueError
            If a line cannot be parsed as `int int float`.

        """
        # empty documents are not stored explicitly in MM format, so the index marks
        # them with a special offset, -1.
        cdef long long docid, termid, previd
        cdef double val = 0

        if offset == -1:
            return []
        if isinstance(self.input, str):
            fin, close_fin = utils.open(self.input, 'rb'), True
        else:
            # caller-owned file-like object: never close it here
            fin, close_fin = self.input, False

        try:
            fin.seek(offset)  # works for gzip/bz2 input, too
            previd, document = -1, []
            for line in fin:
                if (sscanf(line, "%lld %lld %lg", &docid, &termid, &val) != 3):
                    raise ValueError("unable to parse line: {}".format(line))

                if not self.transposed:
                    termid, docid = docid, termid

                # -1 because matrix market indexes are 1-based => convert to 0-based
                docid -= 1
                termid -= 1

                assert previd <= docid, "matrix columns must come in ascending order"
                if docid != previd:
                    if previd >= 0:
                        # reached the first entry of the next document: stop reading
                        break
                    previd = docid

                document.append((termid, val,))  # add another field to the current document
        finally:
            # close a handle we opened ourselves even when parsing fails
            if close_fin:
                fin.close()
        return document
213