#!/usr/bin/env python
"""Sample external indexer: index a directory of mbox files with Recoll.

This sample uses the Recoll Python API to index a directory containing
mbox files. This is not particularly useful as Recoll itself can do this
better (e.g. this script does not process attachments), but it shows the
use of most of the Recoll interface features, except 'parent_udi' (we do
not create a 'self' document to act as the parent).

Usage:
    rclmbox.py
        Index the directory (the path is hard-coded inside the script).
    rclmbox.py [fetch|makesig] udi url ipath
        Fetch subdocument data or make a signature (query time).
"""
from __future__ import print_function

import sys
import glob
import os
import stat
import mailbox
import email.header
import email.utils

# The Recoll module is only needed for indexing: the query-time 'fetch'
# and 'makesig' commands never touch it, so a missing module is reported
# only when indexing is actually attempted (see _require_recoll()).
try:
    from recoll import recoll
except ImportError:
    try:
        import recoll
    except ImportError:
        recoll = None

# EDIT
# Change this for some directory with mbox files, such as a
# Thunderbird/Icedove mail storage directory.
mbdir = os.path.expanduser("~/mail")
#mbdir = os.path.expanduser("~/.icedove/n8n19644.default/Mail/Local Folders/")

# EDIT
# Change this to wherever you want your recoll data to live. Create
# the directory with a (possibly empty) recoll.conf in it before first
# running the script
rclconf = os.path.expanduser("~/.recoll-extern")

# Charset assumed when a header part or body part does not declare one,
# or declares one that Python does not know.
_FALLBACK_CHARSET = "cp1252"


def _require_recoll():
    """Return the Recoll module, raising ImportError if it is unavailable."""
    if recoll is None:
        raise ImportError("the Recoll Python module is required for indexing")
    return recoll


def _decode_bytes(data, charset, what):
    """Decode *data* to text without ever losing the whole value.

    Undecodable bytes are replaced rather than raising, and an unknown or
    unusable *charset* falls back to cp1252. *what* names the thing being
    decoded ("header" or "payload") for the diagnostic message.
    """
    try:
        return data.decode(charset or _FALLBACK_CHARSET, "replace")
    except (LookupError, UnicodeError, ValueError) as err:
        print("Failed decoding %s: %s" % (what, err), file=sys.stderr)
        return data.decode(_FALLBACK_CHARSET, "replace")


# Utility: extract text for named header
def header_value(msg, nm, to_utf=False):
    """Return the decoded value of header *nm* from message *msg*.

    RFC 2047 encoded words are decoded. Each decoded part is followed by
    a single space, so a non-empty result always ends with a space.

    msg    -- an email.message.Message (or compatible) object.
    nm     -- header name, e.g. "Subject".
    to_utf -- if true, return UTF-8 encoded bytes instead of text.

    Returns "" when the header is absent.
    """
    value = msg.get(nm)
    if value is None:
        return ""
    univalue = u""
    for chunk, charset in email.header.decode_header(value):
        if isinstance(chunk, bytes):
            # Python 2 always yields bytes here; Python 3 yields bytes
            # for encoded words and for mixed plain/encoded headers.
            univalue += _decode_bytes(chunk, charset, "header") + u" "
        else:
            univalue += chunk + u" "
    if to_utf:
        return univalue.encode('utf-8')
    return univalue


# Utility: extract text parts from body
def extract_text(msg):
    """Extract and decode all text/plain parts from the message.

    When the RECOLL_FILTER_FORPREVIEW environment variable is "yes", a
    short From/To/Subject header block is prepended. We only output the
    headers for previewing, else they're already output/indexed as
    fields.
    """
    pieces = []
    if os.environ.get("RECOLL_FILTER_FORPREVIEW") == "yes":
        pieces.append(u"From: " + header_value(msg, "From") + u"\n")
        pieces.append(u"To: " + header_value(msg, "To") + u"\n")
        pieces.append(u"Subject: " + header_value(msg, "Subject") + u"\n")
        pieces.append(u"\n")
    for part in msg.walk():
        if part.is_multipart():
            continue
        if part.get_content_type().lower() != "text/plain":
            continue
        payload = part.get_payload(None, True)
        if payload is None:
            continue
        charset = part.get_content_charset(_FALLBACK_CHARSET)
        pieces.append(_decode_bytes(payload, charset, "payload"))
    return u"".join(pieces)


class mbox_indexer:
    """The indexer class. An object is created for indexing one mbox folder."""

    def __init__(self, db, mbfile):
        """Initialize for writable recoll.Db object *db* and mbox file
        *mbfile*. We retrieve the file size and mtime.

        *db* may be None when the object is only used for getdata() or
        sig(). Raises OSError if *mbfile* cannot be stat'ed.
        """
        self.db = db
        self.mbfile = mbfile
        stdata = os.stat(mbfile)
        self.fmtime = stdata[stat.ST_MTIME]
        self.fbytes = stdata[stat.ST_SIZE]
        # Number (1-based) of the next message to be indexed.
        self.msgnum = 1

    def sig(self):
        """Create update verification value for mbox file:
        modification time concatenated with size should cover most
        cases."""
        return str(self.fmtime) + ":" + str(self.fbytes)

    def udi(self, msgnum):
        """Create unique document identifier for message. This should
        be shorter than 150 bytes, which we optimistically don't check
        here, as we just concatenate the mbox file name and message
        number."""
        return self.mbfile + ":" + str(msgnum)

    def index(self):
        """Index every message of the mbox file, unless the index says
        the file is unchanged (checked through the first message's udi
        and the file signature)."""
        if not self.db.needUpdate(self.udi(1), self.sig()):
            print("Index is up to date for %s" % self.mbfile, file=sys.stderr)
            return None
        # create=False: never create an empty mbox as a side effect.
        mb = mailbox.mbox(self.mbfile, create=False)
        try:
            # Iterating the mailbox yields messages one at a time instead
            # of loading the whole folder in memory.
            for msg in mb:
                print("Indexing message %d" % self.msgnum, file=sys.stderr)
                self.index_message(msg)
                self.msgnum += 1
        finally:
            mb.close()

    def getdata(self, ipath):
        """Implements the 'fetch' data access interface (called at
        query time from the command line).

        *ipath* is the 1-based message number, as a string. Returns the
        message text, or "" if there is no such message. Raises
        ValueError if *ipath* is not an integer.
        """
        wanted = int(ipath)
        mb = mailbox.mbox(self.mbfile, create=False)
        try:
            for msgnum, msg in enumerate(mb, 1):
                if msgnum == wanted:
                    return extract_text(msg)
        finally:
            mb.close()
        return ""

    def index_message(self, msg):
        """Build a recoll.Doc for *msg* (numbered self.msgnum) and add
        it to, or update it in, the index."""
        doc = _require_recoll().Doc()

        # Misc standard recoll fields
        doc.author = header_value(msg, "From")
        doc.recipient = header_value(msg, "To") + " " + header_value(msg, "Cc")
        doc.mtime = str(self._message_time(msg))
        doc.title = header_value(msg, "Subject")
        doc.fbytes = str(self.fbytes)

        # Custom field
        doc.myfield = "some value"

        # Main document text and MIME type
        text = extract_text(msg)
        doc.text = text
        doc.dbytes = str(len(text.encode('UTF-8')))
        doc.mimetype = "text/plain"

        # Store data for later "up to date" checks
        doc.sig = self.sig()

        # The rclbes field is the link between the index data and this
        # script when used at query time
        doc.rclbes = "MBOX"

        # These get stored inside the index, and returned at query
        # time, but the main identifier is the condensed 'udi'
        doc.url = "file://" + self.mbfile
        doc.ipath = str(self.msgnum)
        # The udi is the unique document identifier, later used if we
        # want to e.g. delete the document index data (and other ops).
        udi = self.udi(self.msgnum)

        self.db.addOrUpdate(udi, doc)

    def _message_time(self, msg):
        """Return the message date as a Unix time, falling back to the
        mbox file mtime when the Date header is missing or unusable."""
        tm = email.utils.parsedate_tz(header_value(msg, "Date"))
        if tm is None:
            return self.fmtime
        try:
            return email.utils.mktime_tz(tm)
        except (ValueError, OverflowError):
            # Syntactically valid but out-of-range date.
            return self.fmtime


# Index a directory containing mbox files
def index_mboxdir(dir):
    """Index all mbox files directly inside directory *dir*, then purge
    the index. Entries with a '.' in their name and non-regular files
    are skipped. Raises ImportError if the Recoll module is missing."""
    db = _require_recoll().connect(confdir=rclconf, writable=1)
    # Keep the plain concatenation: the resulting paths end up in the
    # udi values stored in the index.
    for ent in glob.glob(dir + "/*"):
        if '.' in os.path.basename(ent):
            # skip .log etc. our mboxes have no exts
            continue
        if not os.path.isfile(ent):
            continue
        print("Processing %s" % ent)
        mbox_indexer(db, ent).index()
    db.purge()


usage_string='''Usage:
rclmbox.py
    Index the directory (the path is hard-coded inside the script)
rclmbox.py [fetch|makesig] udi url ipath
    fetch subdoc data or make signature (query time)
'''
def usage():
    """Print the usage message to stderr and exit with status 1."""
    print("%s" % usage_string, file=sys.stderr)
    sys.exit(1)


def _write_stdout_bytes(data):
    """Write raw bytes to standard output on both Python 2 and 3."""
    binary_out = getattr(sys.stdout, "buffer", None)
    if binary_out is not None:
        binary_out.write(data)
        binary_out.flush()
    elif str is bytes:
        # Python 2: sys.stdout is already a byte stream.
        sys.stdout.write(data)
    else:
        # Text-only replacement stream: hand it text.
        sys.stdout.write(data.decode('UTF-8'))


def main(argv=None):
    """Command-line entry point; returns the process exit status.

    With no argument, index the hard-coded directory. Otherwise expect
    'cmd udi url ipath' where cmd is 'fetch' or 'makesig'.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) == 1:
        index_mboxdir(mbdir)
        return 0

    # cmd [fetch|makesig] udi url ipath
    if len(argv) != 5:
        usage()
    cmd, udi, url, ipath = argv[1:5]
    if cmd not in ('fetch', 'makesig'):
        usage()

    # Only strip the scheme prefix, not later occurrences in the path.
    prefix = 'file://'
    mbfile = url[len(prefix):] if url.startswith(prefix) else url
    # no need for a db for getdata or makesig.
    mbidx = mbox_indexer(None, mbfile)

    if cmd == 'fetch':
        # Emit the UTF-8 bytes themselves; print() of a bytes object
        # would output its b'...' representation on Python 3.
        _write_stdout_bytes(mbidx.getdata(ipath).encode('UTF-8'))
    else:
        print(mbidx.sig(), end="")
    return 0


if __name__ == "__main__":
    sys.exit(main())