1#!/usr/bin/env python
2"""This sample uses the Recoll Python API to index a directory
3containing mbox files. This is not particularly useful as Recoll
4itself can do this better (e.g. this script does not process
5attachments), but it shows the use of most of the Recoll interface
6features, except 'parent_udi' (we do not create a 'self' document to
7act as the parent)."""
8from __future__ import print_function
9
10import sys
11import glob
12import os
13import stat
14import mailbox
15import email.header
16import email.utils
17
18try:
19    from recoll import recoll
20except:
21    import recoll
22
# EDIT
# Directory scanned for mbox files when the script is run without
# arguments. Change this for some directory with mbox files, such as a
# Thunderbird/Icedove mail storage directory.
mbdir = os.path.expanduser("~/mail")
#mbdir = os.path.expanduser("~/.icedove/n8n19644.default/Mail/Local Folders/")

# EDIT
# Recoll configuration directory, passed as 'confdir' when connecting
# to the index. Change this to wherever you want your recoll data to
# live. Create the directory with a (possibly empty) recoll.conf in it
# before first running the script.
rclconf = os.path.expanduser("~/.recoll-extern")
34
35# Utility: extract text for named header
def header_value(msg, nm, to_utf = False):
    """Return the decoded text of header `nm` from `msg`.

    RFC 2047 encoded words are decoded with their declared charset; raw
    byte chunks with no declared charset are assumed to be cp1252. A
    chunk which cannot be decoded as declared (unknown charset name,
    invalid byte sequence) is reported on stderr, then decoded as cp1252
    with replacement characters, so that it is not lost altogether.

    msg    -- object with a mapping-style get(), e.g. an email message.
    nm     -- header name.
    to_utf -- if true, return the value encoded as UTF-8 bytes instead
              of text.

    Returns "" if the header is absent. Otherwise each decoded chunk is
    followed by one space, so the result always ends with a space.
    """
    value = msg.get(nm)
    if value is None:
        return ""
    chunks = []
    for data, charset in email.header.decode_header(value):
        if not isinstance(data, bytes):
            # Already text (Python 3 header without encoded words).
            chunks.append(data)
            continue
        try:
            chunks.append(data.decode(charset or "cp1252"))
        except Exception as err:
            # Charset names come from untrusted mail and may fail in
            # codec-specific ways: stay best-effort, but keep the text.
            print("Failed decoding header: %s" % err, file=sys.stderr)
            chunks.append(data.decode("cp1252", "replace"))
    univalue = u"".join(chunk + u" " for chunk in chunks)
    if to_utf:
        return univalue.encode('utf-8')
    return univalue
60
61# Utility: extract text parts from body
def extract_text(msg):
    """Extract and decode all text/plain parts from the message.

    When the RECOLL_FILTER_FORPREVIEW environment variable is "yes", the
    From, To and Subject headers are output first, followed by an empty
    line. Otherwise the headers are left out, as they are already
    indexed as fields.

    Each text/plain part is decoded with its declared charset (cp1252
    if none is declared). If this fails (unknown charset name, invalid
    bytes), the error is reported on stderr and the part is decoded as
    cp1252 with replacement characters, so that its text stays
    searchable.

    Returns the concatenated text, "" if there is no text/plain part.
    """
    pieces = []
    if os.environ.get("RECOLL_FILTER_FORPREVIEW") == "yes":
        pieces.append(u"From: " + header_value(msg, "From") + u"\n")
        pieces.append(u"To: " + header_value(msg, "To") + u"\n")
        pieces.append(u"Subject: " + header_value(msg, "Subject") + u"\n")
        pieces.append(u"\n")
    for part in msg.walk():
        if part.is_multipart():
            continue
        if part.get_content_type().lower() != "text/plain":
            continue
        charset = part.get_content_charset("cp1252")
        payload = part.get_payload(None, True)
        if payload is None:
            # Nothing to decode for this part.
            continue
        try:
            pieces.append(payload.decode(charset))
        except Exception as err:
            # The charset name comes from the message and cannot be
            # trusted: stay best-effort, but keep the text.
            print("Failed decoding payload: %s" % err,
                  file=sys.stderr)
            pieces.append(payload.decode("cp1252", "replace"))
    return u"".join(pieces)
90
91
92
class mbox_indexer:
    """The indexer class. An object is created for indexing one mbox folder"""
    def __init__(self, db, mbfile):
        """Initialize for writable db recoll.Db object and mbfile mbox
        file. We retrieve the file size and mtime.

        db may be None when the object is only used for getdata() or
        sig(). Raises OSError if mbfile cannot be stat'ed."""
        self.db = db
        self.mbfile = mbfile
        stdata = os.stat(mbfile)
        self.fmtime = stdata[stat.ST_MTIME]
        self.fbytes = stdata[stat.ST_SIZE]
        # Number of the next message to index (messages count from 1)
        self.msgnum = 1

    def sig(self):
        """Create update verification value for mbox file:
        modification time concatenated with size should cover most
        cases"""
        return str(self.fmtime) + ":" + str(self.fbytes)

    def udi(self, msgnum):
        """Create unique document identifier for message. This should
        be shorter than 150 bytes, which we optimistically don't check
        here, as we just concatenate the mbox file name and message
        number"""
        return self.mbfile + ":" + str(msgnum)

    def index(self):
        """Index all messages from the mbox file.

        Nothing is done if the db reports that the first message is up
        to date for the current file signature. Always returns None."""
        if not self.db.needUpdate(self.udi(1), self.sig()):
            print("Index is up to date for %s"%self.mbfile, file=sys.stderr)
            return None
        mb = mailbox.mbox(self.mbfile)
        try:
            # Iterating on the mailbox yields messages one at a time,
            # values() would load the whole folder in memory.
            for msg in mb:
                print("Indexing message %d" % self.msgnum, file=sys.stderr)
                self.index_message(msg)
                self.msgnum += 1
        finally:
            # Release the file handle even if a message fails
            mb.close()

    def getdata(self, ipath):
        """Implements the 'fetch' data access interface (called at
        query time from the command line).

        ipath is the message number as a decimal string, counting from
        1. Returns the message text, or "" if the folder has no such
        message. Raises ValueError if ipath is not an integer."""
        imsgnum = int(ipath)
        mb = mailbox.mbox(self.mbfile)
        try:
            for msgnum, msg in enumerate(mb, 1):
                if msgnum == imsgnum:
                    return extract_text(msg)
        finally:
            mb.close()
        return ""

    def index_message(self, msg):
        """Create the recoll document for msg and add it to the index,
        or update it, under the udi for the current message number."""
        doc = recoll.Doc()

        # Misc standard recoll fields
        doc.author = header_value(msg, "From")
        doc.recipient = header_value(msg, "To") + " " + header_value(msg, "Cc")
        dte = header_value(msg, "Date")
        tm = email.utils.parsedate_tz(dte)
        # Use the file mtime if the message has no usable date. A date
        # which parses but is out of range must not abort the indexing.
        doc.mtime = str(self.fmtime)
        if tm is not None:
            try:
                doc.mtime = str(email.utils.mktime_tz(tm))
            except (ValueError, OverflowError):
                pass
        doc.title = header_value(msg, "Subject")
        doc.fbytes = str(self.fbytes)

        # Custom field
        doc.myfield = "some value"

        # Main document text and MIME type
        doc.text = extract_text(msg)
        doc.dbytes = str(len(doc.text.encode('UTF-8')))
        doc.mimetype = "text/plain"

        # Store data for later "up to date" checks
        doc.sig = self.sig()

        # The rclbes field is the link between the index data and this
        # script when used at query time
        doc.rclbes = "MBOX"

        # These get stored inside the index, and returned at query
        # time, but the main identifier is the condensed 'udi'
        doc.url = "file://" + self.mbfile
        doc.ipath = str(self.msgnum)
        # The udi is the unique document identifier, later used if we
        # want to e.g. delete the document index data (and other ops).
        udi = self.udi(self.msgnum)

        self.db.addOrUpdate(udi, doc)
180
181# Index a directory containing mbox files
def index_mboxdir(dir):
    """Index every mbox file found directly inside directory `dir`.

    Opens the index configured by rclconf for writing, runs an
    mbox_indexer on each candidate file, then purges the index.
    Entries with a dot in their name (.log etc.) and entries which are
    not regular files are skipped: our mboxes have no extensions.
    """
    db = recoll.connect(confdir=rclconf, writable=1)
    for path in glob.glob(dir + "/*"):
        has_no_ext = '.' not in os.path.basename(path)
        if has_no_ext and os.path.isfile(path):
            print("Processing %s"%path)
            mbox_indexer(db, path).index()
    # NOTE(review): presumably removes index data for documents not seen
    # during this pass -- confirm against the Recoll API documentation.
    db.purge()
195
# Help text printed by usage()
usage_string = (
    "Usage:\n"
    "rclmbox.py\n"
    "    Index the directory (the path is hard-coded inside the script)\n"
    "rclmbox.py [fetch|makesig] udi url ipath\n"
    "    fetch subdoc data or make signature (query time)\n"
)


def usage():
    """Print the help text on stderr, then exit with status 1."""
    sys.stderr.write(usage_string + "\n")
    sys.exit(1)
205
if len(sys.argv) == 1:
    # No arguments: indexing pass on the configured directory
    index_mboxdir(mbdir)
else:
    # Query time: cmd [fetch|makesig] udi url ipath
    if len(sys.argv) != 5:
        usage()
    cmd, udi, url, ipath = sys.argv[1:5]

    # Only strip the leading scheme: replacing every occurrence would
    # damage a path which happens to contain 'file://' further on.
    if url.startswith('file://'):
        mbfile = url[len('file://'):]
    else:
        mbfile = url
    # no need for a db for getdata or makesig.
    mbidx = mbox_indexer(None, mbfile)

    if cmd == 'fetch':
        data = mbidx.getdata(ipath).encode('UTF-8')
        # Output the raw UTF-8 bytes. With Python 3, formatting bytes
        # through "%s" would print their b'...' representation, so use
        # the binary layer of stdout when there is one.
        out = getattr(sys.stdout, 'buffer', sys.stdout)
        out.write(data)
        out.flush()
    elif cmd == 'makesig':
        print(mbidx.sig(), end="")
    else:
        usage()

sys.exit(0)
229