1# Copyright (C) 1998-2018 by the Free Software Foundation, Inc.
2#
3# This program is free software; you can redistribute it and/or
4# modify it under the terms of the GNU General Public License
5# as published by the Free Software Foundation; either version 2
6# of the License, or (at your option) any later version.
7#
8# This program is distributed in the hope that it will be useful,
9# but WITHOUT ANY WARRANTY; without even the implied warranty of
10# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11# GNU General Public License for more details.
12#
13# You should have received a copy of the GNU General Public License
14# along with this program; if not, write to the Free Software
15# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
16# USA.
17
18"""HyperArch: Pipermail archiving for Mailman
19
20     - The Dragon De Monsyne <dragondm@integral.org>
21
22   TODO:
23     - Should be able to force all HTML to be regenerated next time the
24       archive is run, in case a template is changed.
25     - Run a command to generate tarball of html archives for downloading
26       (probably in the 'update_dirty_archives' method).
27"""
28
29from __future__ import nested_scopes
30
31import sys
32import re
33import errno
34import urllib
35import time
36import os
37import types
38import HyperDatabase
39import pipermail
40import weakref
41import binascii
42
43from email.Header import decode_header, make_header
44from email.Errors import HeaderParseError
45from email.Charset import Charset
46
47from Mailman import mm_cfg
48from Mailman import Utils
49from Mailman import Errors
50from Mailman import LockFile
51from Mailman import MailList
52from Mailman import i18n
53from Mailman.SafeDict import SafeDict
54from Mailman.Logging.Syslog import syslog
55from Mailman.Mailbox import ArchiverMailbox
56
57# Set up i18n.  Assume the current language has already been set in the caller.
58_ = i18n._
59C_ = i18n.C_
60
# `gzip` is the module object only when mm_cfg.GZIP_ARCHIVE_TXT_FILES is true
# and the import succeeds; in every other case it stays None, so code using
# it can simply test the name for truth.
gzip = None
if mm_cfg.GZIP_ARCHIVE_TXT_FILES:
    try:
        import gzip
    except ImportError:
        # No gzip module available: leave `gzip` as None.
        pass
67
68EMPTYSTRING = ''
69NL = '\n'
70
# MacOSX has a default stack size that is too small for deeply recursive
# regular expressions.  We see this as crashes in the Python test suite when
# running test_re.py and test_sre.py.  The fix is to set the stack limit to
# 2048; the general recommendation is to do this in the shell before running
# the test suite.  But that's inconvenient for a daemon like the qrunner.
#
# AFAIK, this problem only affects the archiver, so we're adding this
# workaround to this file (it'll get imported by the bundled pipermail or by
# the bin/arch script).  We also only do this on darwin, a.k.a. MacOSX.
if sys.platform == 'darwin':
    try:
        import resource
    except ImportError:
        # Without the resource module the stack limit is left untouched.
        pass
    else:
        soft, hard = resource.getrlimit(resource.RLIMIT_STACK)
        # Raise the soft limit to at least 1024*2048, but never beyond the
        # hard limit.  A soft limit that is already larger is kept as is.
        newsoft = min(hard, max(soft, 1024*2048))
        resource.setrlimit(resource.RLIMIT_STACK, (newsoft, hard))
89
90
# Compatibility shim: if the names True and False are not defined (the bare
# expression below raises NameError), bind them to 1 and 0 at module level.
try:
    True, False
except NameError:
    True = 1
    False = 0
96
97
98
def html_quote(s, lang=None):
    """Return `s` with &, <, > and the double quote turned into entities.

    The escaped text is then handed to Utils.uncanonstr() together with
    `lang`, and whatever that call returns is the result.
    """
    # The ampersand has to be handled first; otherwise the entities produced
    # by the later substitutions would themselves get escaped.
    quoted = s.replace('&', '&amp;')
    quoted = quoted.replace("<", '&lt;')
    quoted = quoted.replace(">", '&gt;')
    quoted = quoted.replace('"', '&quot;')
    return Utils.uncanonstr(quoted, lang)
107
108
109def url_quote(s):
110    return urllib.quote(s)
111
112
def null_to_space(s):
    """Return a copy of `s` in which every NUL character is a space."""
    # Splitting on NUL and re-joining with a space is the same as a global
    # replace of one character by the other.
    return ' '.join(s.split('\000'))
115
116
117def sizeof(filename, lang):
118    try:
119        size = os.path.getsize(filename)
120    except OSError, e:
121        # ENOENT can happen if the .mbox file was moved away or deleted, and
122        # an explicit mbox file name was given to bin/arch.
123        if e.errno <> errno.ENOENT: raise
124        return _('size not available')
125    if size < 1000:
126        # Avoid i18n side-effects
127        otrans = i18n.get_translation()
128        try:
129            i18n.set_language(lang)
130            out = _(' %(size)i bytes ')
131        finally:
132            i18n.set_translation(otrans)
133        return out
134    elif size < 1000000:
135        return ' %d KB ' % (size / 1000)
136    # GB?? :-)
137    return ' %d MB ' % (size / 1000000)
138
139
140html_charset = '<META http-equiv="Content-Type" ' \
141               'content="text/html; charset=%s">'
142
def CGIescape(arg, lang=None):
    """Return `arg` made web-safe, with double quotes as &quot; entities.

    Unicode arguments are passed to Utils.websafe() unchanged; anything
    else is converted with str() first.  The escaped text is finally run
    through Utils.uncanonstr() with `lang`.
    """
    if not isinstance(arg, types.UnicodeType):
        arg = str(arg)
    safe = Utils.websafe(arg)
    return Utils.uncanonstr(safe.replace('"', '&quot;'), lang)
149
# Parenthesized human name.  The match is greedy: it runs from the first
# '(' to the last ')' it can reach.
paren_name_pat = re.compile(r'([(].*[)])')

# Subject lines preceded with 'Re:'.  Also accepts a bracketed number
# between the 'Re' and the colon (e.g. 'Re[2]:'); case-insensitive.
REpat = re.compile( r"\s*RE\s*(\[\d+\]\s*)?:\s*", re.IGNORECASE)

# E-mail addresses in text.  Note that a comma is accepted in the part
# before the '@' but not in the part after it.
emailpat = re.compile(r'([-+,.\w]+@[-+.\w]+)')

#  Argh!  This pattern is buggy, and will choke on URLs with GET parameters.
# MAS: Given that people are not constrained in how they write URIs in plain
# text, it is not possible to have a single regexp to reliably match them.
# The regexp below is intended to match straightforward cases.  Even humans
# can't reliably tell whether various punctuation at the end of a URI is part
# of the URI or not.
#
# Group 1 is the URL itself: a run of letters, '://', then as little as
# possible up to a terminator.  Terminators are an underscore followed by
# whitespace or end of string, the end of string, one of ] } ) > ' " or
# whitespace; the terminator is not part of group 1.
urlpat = re.compile(r'([a-z]+://.*?)(?:_\s|_$|$|[]})>\'"\s])', re.IGNORECASE)

# Blank lines (empty or whitespace only)
blankpat = re.compile(r'^\s*$')

# Starting <html> directive, alone on its line (any case)
htmlpat = re.compile(r'^\s*<HTML>\s*$', re.IGNORECASE)
# Ending </html> directive, alone on its line (any case)
nohtmlpat = re.compile(r'^\s*</HTML>\s*$', re.IGNORECASE)
# Match quoted text: a leading run of '>', '|', ':' or the entity '&gt;'
quotedpat = re.compile(r'^([>|:]|&gt;)+')
176
177
178
179# Like Utils.maketext() but with caching to improve performance.
180#
181# _templatefilepathcache is used to associate a (templatefile, lang, listname)
182# key with the file system path to a template file.  This path is the one that
183# the Utils.findtext() function has computed is the one to match the values in
184# the key tuple.
185#
# _templatecache associates a file system path (the key) with the text
# returned by Utils.findtext() after processing the contents of that file.
188#
189# We keep two caches to reduce the amount of template text kept in memory,
190# since the _templatefilepathcache is a many->one mapping and _templatecache
191# is a one->one mapping.  Imagine 1000 lists all using the same default
192# English template.
193
194_templatefilepathcache = {}
195_templatecache = {}
196
197def quick_maketext(templatefile, dict=None, lang=None, mlist=None):
198    if mlist is None:
199        listname = ''
200    else:
201        listname = mlist._internal_name
202    if lang is None:
203        if mlist is None:
204            lang = mm_cfg.DEFAULT_SERVER_LANGUAGE
205        else:
206            lang = mlist.preferred_language
207    cachekey = (templatefile, lang, listname)
208    filepath =  _templatefilepathcache.get(cachekey)
209    if filepath:
210        template = _templatecache.get(filepath)
211    if filepath is None or template is None:
212        # Use the basic maketext, with defaults to get the raw template
213        template, filepath = Utils.findtext(templatefile, lang=lang,
214                                            raw=True, mlist=mlist)
215        _templatefilepathcache[cachekey] = filepath
216        _templatecache[filepath] = template
217    # Copied from Utils.maketext()
218    text = template
219    if dict is not None:
220        try:
221            sdict = SafeDict(dict)
222            try:
223                text = sdict.interpolate(template)
224            except UnicodeError:
225                # Try again after coercing the template to unicode
226                utemplate = unicode(template,
227                                    Utils.GetCharSet(lang),
228                                    'replace')
229                text = sdict.interpolate(utemplate)
230        except (TypeError, ValueError), e:
231            # The template is really screwed up
232            syslog('error', 'broken template: %s\n%s', filepath, e)
233    # Make sure the text is in the given character set, or html-ify any bogus
234    # characters.
235    return Utils.uncanonstr(text, lang)
236
237
238
239# Note: I'm overriding most, if not all of the pipermail Article class
240#       here -ddm
241# The Article class encapsulates a single posting.  The attributes are:
242#
243#  sequence : Sequence number, unique for each article in a set of archives
244#  subject  : Subject
245#  datestr  : The posting date, in human-readable format
246#  date     : The posting date, in purely numeric format
247#  fromdate : The posting date, in `unixfrom' format
248#  headers  : Any other headers of interest
249#  author   : The author's name (and possibly organization)
250#  email    : The author's e-mail address
251#  msgid    : A unique message ID
252#  in_reply_to : If !="", this is the msgid of the article being replied to
253#  references: A (possibly empty) list of msgid's of earlier articles in
254#              the thread
255#  body     : A list of strings making up the message body
256
257class Article(pipermail.Article):
258    __super_init = pipermail.Article.__init__
259    __super_set_date = pipermail.Article._set_date
260
261    _last_article_time = time.time()
262
263    def __init__(self, message=None, sequence=0, keepHeaders=[],
264                       lang=mm_cfg.DEFAULT_SERVER_LANGUAGE, mlist=None):
265        self.__super_init(message, sequence, keepHeaders)
266        self.prev = None
267        self.next = None
268        # Trim Re: from the subject line
269        i = 0
270        while i != -1:
271            result = REpat.match(self.subject)
272            if result:
273                i = result.end(0)
274                self.subject = self.subject[i:]
275                if self.subject == '':
276                    self.subject = _('No subject')
277            else:
278                i = -1
279        # Useful to keep around
280        self._lang = lang
281        self._mlist = mlist
282
283        if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS:
284            # Avoid i18n side-effects.  Note that the language for this
285            # article (for this list) could be different from the site-wide
286            # preferred language, so we need to ensure no side-effects will
287            # occur.  Think what happens when executing bin/arch.
288            otrans = i18n.get_translation()
289            try:
290                i18n.set_language(lang)
291                if self.author == self.email:
292                    self.author = self.email = re.sub('@', _(' at '),
293                                                      self.email)
294                else:
295                    self.email = re.sub('@', _(' at '), self.email)
296            finally:
297                i18n.set_translation(otrans)
298
299        # Snag the content-* headers.  RFC 1521 states that their values are
300        # case insensitive.
301        ctype = message.get('Content-Type', 'text/plain')
302        cenc = message.get('Content-Transfer-Encoding', '')
303        self.ctype = ctype.lower()
304        self.cenc = cenc.lower()
305        self.decoded = {}
306        cset = Utils.GetCharSet(mlist.preferred_language)
307        cset_out = Charset(cset).output_charset or cset
308        if isinstance(cset_out, unicode):
309            # email 3.0.1 (python 2.4) doesn't like unicode
310            cset_out = cset_out.encode('us-ascii')
311        charset = message.get_content_charset(cset_out)
312        if charset:
313            charset = charset.lower().strip()
314            if charset[0]=='"' and charset[-1]=='"':
315                charset = charset[1:-1]
316            if charset[0]=="'" and charset[-1]=="'":
317                charset = charset[1:-1]
318            try:
319                body = message.get_payload(decode=True)
320            except binascii.Error:
321                body = None
322            if body and charset != Utils.GetCharSet(self._lang):
323                # decode body
324                try:
325                    body = unicode(body, charset)
326                except (UnicodeError, LookupError):
327                    body = None
328            if body:
329                self.body = [l + "\n" for l in body.splitlines()]
330
331        self.decode_headers()
332
333    # Mapping of listnames to MailList instances as a weak value dictionary.
334    # This code is copied from Runner.py but there's one important operational
335    # difference.  In Runner.py, we always .Load() the MailList object for
336    # each _dispose() run, otherwise the object retrieved from the cache won't
337    # be up-to-date.  Since we're creating a new HyperArchive instance for
338    # each message being archived, we don't need to worry about that -- but it
339    # does mean there are additional opportunities for optimization.
340    _listcache = weakref.WeakValueDictionary()
341
342    def _open_list(self, listname):
343        # Cache the open list so that any use of the list within this process
344        # uses the same object.  We use a WeakValueDictionary so that when the
345        # list is no longer necessary, its memory is freed.
346        mlist = self._listcache.get(listname)
347        if not mlist:
348            try:
349                mlist = MailList.MailList(listname, lock=0)
350            except Errors.MMListError, e:
351                syslog('error', 'error opening list: %s\n%s', listname, e)
352                return None
353            else:
354                self._listcache[listname] = mlist
355        return mlist
356
357    def __getstate__(self):
358        d = self.__dict__.copy()
359        # We definitely don't want to pickle the MailList instance, so just
360        # pickle a reference to it.
361        if d.has_key('_mlist'):
362            mlist = d['_mlist']
363            del d['_mlist']
364        else:
365            mlist = None
366        if mlist:
367            d['__listname'] = self._mlist.internal_name()
368        else:
369            d['__listname'] = None
370        # Delete a few other things we don't want in the pickle
371        for attr in ('prev', 'next', 'body'):
372            if d.has_key(attr):
373                del d[attr]
374        d['body'] = []
375        return d
376
377    def __setstate__(self, d):
378        # For loading older Articles via pickle.  All this stuff was added
379        # when Simone Piunni and Tokio Kikuchi i18n'ified Pipermail.  See SF
380        # patch #594771.
381        self.__dict__ = d
382        listname = d.get('__listname')
383        if listname:
384            del d['__listname']
385            d['_mlist'] = self._open_list(listname)
386        if not d.has_key('_lang'):
387            if hasattr(self, '_mlist'):
388                self._lang = self._mlist.preferred_language
389            else:
390                self._lang = mm_cfg.DEFAULT_SERVER_LANGUAGE
391        if not d.has_key('cenc'):
392            self.cenc = None
393        if not d.has_key('decoded'):
394            self.decoded = {}
395
396    def setListIfUnset(self, mlist):
397        if getattr(self, '_mlist', None) is None:
398            self._mlist = mlist
399
400    def quote(self, buf):
401        return html_quote(buf, self._lang)
402
403    def decode_headers(self):
404        """MIME-decode headers.
405
406        If the email, subject, or author attributes contain non-ASCII
407        characters using the encoded-word syntax of RFC 2047, decoded versions
408        of those attributes are placed in the self.decoded (a dictionary).
409
410        If the list's charset differs from the header charset, an attempt is
411        made to decode the headers as Unicode.  If that fails, they are left
412        undecoded.
413        """
414        author = self.decode_charset(self.author)
415        subject = self.decode_charset(self.subject)
416        if author:
417            self.decoded['author'] = author
418            email = self.decode_charset(self.email)
419            if email:
420                self.decoded['email'] = email
421        if subject:
422            if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS:
423                otrans = i18n.get_translation()
424                try:
425                    i18n.set_language(self._lang)
426                    atmark = unicode(_(' at '), Utils.GetCharSet(self._lang))
427                    subject = re.sub(r'([-+,.\w]+)@([-+.\w]+)',
428                              '\g<1>' + atmark + '\g<2>', subject)
429                finally:
430                    i18n.set_translation(otrans)
431            self.decoded['subject'] = subject
432        self.decoded['stripped'] = self.strip_subject(subject or self.subject)
433
434    def strip_subject(self, subject):
435        # Strip subject_prefix and Re: for subject sorting
436        # This part was taken from CookHeaders.py (TK)
437        prefix = self._mlist.subject_prefix.strip()
438        if prefix:
439            prefix_pat = re.escape(prefix)
440            prefix_pat = '%'.join(prefix_pat.split(r'\%'))
441            prefix_pat = re.sub(r'%\d*d', r'\s*\d+\s*', prefix_pat)
442            subject = re.sub(prefix_pat, '', subject)
443        subject = subject.lstrip()
444        # MAS Should we strip FW and FWD too?
445        strip_pat = re.compile('^((RE|AW|SV|VS)(\[\d+\])?:\s*)+', re.I)
446        stripped = strip_pat.sub('', subject)
447        # Also remove whitespace to avoid folding/unfolding differences
448        stripped = re.sub('\s', '', stripped)
449        return stripped
450
451    def decode_charset(self, field):
452        # TK: This function was rewritten for unifying to Unicode.
453        # Convert 'field' into Unicode one line string.
454        try:
455            pairs = decode_header(field)
456            ustr = make_header(pairs).__unicode__()
457        except (LookupError, UnicodeError, ValueError, HeaderParseError):
458            # assume list's language
459            cset = Utils.GetCharSet(self._mlist.preferred_language)
460            if cset == 'us-ascii':
461                cset = 'iso-8859-1' # assume this for English list
462            ustr = unicode(field, cset, 'replace')
463        return u''.join(ustr.splitlines())
464
465    def as_html(self):
466        d = self.__dict__.copy()
467        # avoid i18n side-effects
468        otrans = i18n.get_translation()
469        i18n.set_language(self._lang)
470        try:
471            d["prev"], d["prev_wsubj"] = self._get_prev()
472            d["next"], d["next_wsubj"] = self._get_next()
473
474            d["email_html"] = self.quote(self.email)
475            d["title"] = self.quote(self.subject)
476            d["subject_html"] = self.quote(self.subject)
477            d["message_id"] = self.quote(self._message_id)
478            # TK: These two _url variables are used to compose a response
479            # from the archive web page.  So, ...
480            d["subject_url"] = url_quote('Re: ' + self.subject)
481            d["in_reply_to_url"] = url_quote(self._message_id)
482            if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS:
483                # Point the mailto url back to the list
484                author = re.sub('@', _(' at '), self.author)
485                emailurl = self._mlist.GetListEmail()
486            else:
487                author = self.author
488                emailurl = self.email
489            d["author_html"] = self.quote(author)
490            d["email_url"] = url_quote(emailurl)
491            d["datestr_html"] = self.quote(i18n.ctime(int(self.date)))
492            d["body"] = self._get_body()
493            d['listurl'] = self._mlist.GetScriptURL('listinfo', absolute=1)
494            d['listname'] = self._mlist.real_name
495            d['encoding'] = ''
496        finally:
497            i18n.set_translation(otrans)
498
499        charset = Utils.GetCharSet(self._lang)
500        d["encoding"] = html_charset % charset
501
502        self._add_decoded(d)
503        return quick_maketext(
504             'article.html', d,
505             lang=self._lang, mlist=self._mlist)
506
507    def _get_prev(self):
508        """Return the href and subject for the previous message"""
509        if self.prev:
510            subject = self._get_subject_enc(self.prev)
511            prev = ('<LINK REL="Previous"  HREF="%s">'
512                    % (url_quote(self.prev.filename)))
513            prev_wsubj = ('<LI>' + _('Previous message (by thread):') +
514                          ' <A HREF="%s">%s\n</A></li>'
515                          % (url_quote(self.prev.filename),
516                             self.quote(subject)))
517        else:
518            prev = prev_wsubj = ""
519        return prev, prev_wsubj
520
521    def _get_subject_enc(self, art):
522        """Return the subject of art, decoded if possible.
523
524        If the charset of the current message and art match and the
525        article's subject is encoded, decode it.
526        """
527        return art.decoded.get('subject', art.subject)
528
529    def _get_next(self):
530        """Return the href and subject for the previous message"""
531        if self.next:
532            subject = self._get_subject_enc(self.next)
533            next = ('<LINK REL="Next"  HREF="%s">'
534                    % (url_quote(self.next.filename)))
535            next_wsubj = ('<LI>' + _('Next message (by thread):') +
536                          ' <A HREF="%s">%s\n</A></li>'
537                          % (url_quote(self.next.filename),
538                             self.quote(subject)))
539        else:
540            next = next_wsubj = ""
541        return next, next_wsubj
542
543    _rx_quote = re.compile('=([A-F0-9][A-F0-9])')
544    _rx_softline = re.compile('=[ \t]*$')
545
546    def _get_body(self):
547        """Return the message body ready for HTML, decoded if necessary"""
548        try:
549            body = self.html_body
550        except AttributeError:
551            body = self.body
552        return null_to_space(EMPTYSTRING.join(body))
553
554    def _add_decoded(self, d):
555        """Add encoded-word keys to HTML output"""
556        for src, dst in (('author', 'author_html'),
557                         ('email', 'email_html'),
558                         ('subject', 'subject_html'),
559                         ('subject', 'title')):
560            if self.decoded.has_key(src):
561                d[dst] = self.quote(self.decoded[src])
562
563    def as_text(self):
564        d = self.__dict__.copy()
565        # We need to guarantee a valid From_ line, even if there are
566        # bososities in the headers.
567        if not d.get('fromdate', '').strip():
568            d['fromdate'] = time.ctime(time.time())
569        if not d.get('email', '').strip():
570            d['email'] = 'bogus@does.not.exist.com'
571        if not d.get('datestr', '').strip():
572            d['datestr'] = time.ctime(time.time())
573        #
574        headers = ['From %(email)s  %(fromdate)s',
575                 'From: %(email)s (%(author)s)',
576                 'Date: %(datestr)s',
577                 'Subject: %(subject)s']
578        if d['_in_reply_to']:
579            headers.append('In-Reply-To: %(_in_reply_to)s')
580        if d['_references']:
581            headers.append('References: %(_references)s')
582        if d['_message_id']:
583            headers.append('Message-ID: %(_message_id)s')
584        body = EMPTYSTRING.join(self.body)
585        cset = Utils.GetCharSet(self._lang)
586        # Coerce the body to Unicode and replace any invalid characters.
587        if not isinstance(body, types.UnicodeType):
588            body = unicode(body, cset, 'replace')
589        if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS:
590            otrans = i18n.get_translation()
591            try:
592                i18n.set_language(self._lang)
593                atmark = unicode(_(' at '), cset)
594                body = re.sub(r'([-+,.\w]+)@([-+.\w]+)',
595                              '\g<1>' + atmark + '\g<2>', body)
596            finally:
597                i18n.set_translation(otrans)
598        # Return body to character set of article.
599        body = body.encode(cset, 'replace')
600        return NL.join(headers) % d + '\n\n' + body + '\n'
601
602    def _set_date(self, message):
603        self.__super_set_date(message)
604        self.fromdate = time.ctime(int(self.date))
605
606    def loadbody_fromHTML(self,fileobj):
607        self.body = []
608        begin = 0
609        while 1:
610            line = fileobj.readline()
611            if not line:
612                break
613            if not begin:
614                if line.strip() == '<!--beginarticle-->':
615                    begin = 1
616                continue
617            if line.strip() == '<!--endarticle-->':
618                break
619            self.body.append(line)
620
621    def finished_update_article(self):
622        self.body = []
623        try:
624            del self.html_body
625        except AttributeError:
626            pass
627
628
629class HyperArchive(pipermail.T):
630    __super_init = pipermail.T.__init__
631    __super_update_archive = pipermail.T.update_archive
632    __super_update_dirty_archives = pipermail.T.update_dirty_archives
633    __super_add_article = pipermail.T.add_article
634
635    # some defaults
636    DIRMODE = 02775
637    FILEMODE = 0660
638
639    VERBOSE = 0
640    DEFAULTINDEX = 'thread'
641    ARCHIVE_PERIOD = 'month'
642
643    THREADLAZY = 0
644    THREADLEVELS = 3
645
646    ALLOWHTML = 1             # "Lines between <html></html>" handled as is.
647    SHOWHTML = 0              # Eg, nuke leading whitespace in html manner.
648    IQUOTES = 1               # Italicize quoted text.
649    SHOWBR = 0                # Add <br> onto every line
650
651    def __init__(self, maillist):
652        # can't init the database while other processes are writing to it!
653        # XXX TODO- implement native locking
654        # with mailman's LockFile module for HyperDatabase.HyperDatabase
655        #
656        dir = maillist.archive_dir()
657        db = HyperDatabase.HyperDatabase(dir, maillist)
658        self.__super_init(dir, reload=1, database=db)
659
660        self.maillist = maillist
661        self._lock_file = None
662        self.lang = maillist.preferred_language
663        self.charset = Utils.GetCharSet(maillist.preferred_language)
664
665        if hasattr(self.maillist,'archive_volume_frequency'):
666            if self.maillist.archive_volume_frequency == 0:
667                self.ARCHIVE_PERIOD='year'
668            elif self.maillist.archive_volume_frequency == 2:
669                self.ARCHIVE_PERIOD='quarter'
670            elif self.maillist.archive_volume_frequency == 3:
671                self.ARCHIVE_PERIOD='week'
672            elif self.maillist.archive_volume_frequency == 4:
673                self.ARCHIVE_PERIOD='day'
674            else:
675                self.ARCHIVE_PERIOD='month'
676
677        yre = r'(?P<year>[0-9]{4,4})'
678        mre = r'(?P<month>[01][0-9])'
679        dre = r'(?P<day>[0123][0-9])'
680        self._volre = {
681            'year':    '^' + yre + '$',
682            'quarter': '^' + yre + r'q(?P<quarter>[1234])$',
683            'month':   '^' + yre + r'-(?P<month>[a-zA-Z]+)$',
684            'week':    r'^Week-of-Mon-' + yre + mre + dre,
685            'day':     '^' + yre + mre + dre + '$'
686            }
687
688    def _makeArticle(self, msg, sequence):
689        return Article(msg, sequence,
690                       lang=self.maillist.preferred_language,
691                       mlist=self.maillist)
692
693    def html_foot(self):
694        # avoid i18n side-effects
695        mlist = self.maillist
696        otrans = i18n.get_translation()
697        i18n.set_language(mlist.preferred_language)
698        # Convenience
699        def quotetime(s):
700            return html_quote(i18n.ctime(s), self.lang)
701        try:
702            d = {"lastdate": quotetime(self.lastdate),
703                 "archivedate": quotetime(self.archivedate),
704                 "listinfo": mlist.GetScriptURL('listinfo', absolute=1),
705                 "version": self.version,
706                 "listname": html_quote(mlist.real_name, self.lang),
707                 }
708            i = {"thread": _("thread"),
709                 "subject": _("subject"),
710                 "author": _("author"),
711                 "date": _("date")
712                 }
713        finally:
714            i18n.set_translation(otrans)
715
716        for t in i.keys():
717            cap = t[0].upper() + t[1:]
718            if self.type == cap:
719                d["%s_ref" % (t)] = ""
720            else:
721                d["%s_ref" % (t)] = ('<a href="%s.html#start">[ %s ]</a>'
722                                     % (t, i[t]))
723        return quick_maketext(
724            'archidxfoot.html', d,
725            mlist=mlist)
726
727    def html_head(self):
728        # avoid i18n side-effects
729        mlist = self.maillist
730        otrans = i18n.get_translation()
731        i18n.set_language(mlist.preferred_language)
732        # Convenience
733        def quotetime(s):
734            return html_quote(i18n.ctime(s), self.lang)
735        try:
736            d = {"listname": html_quote(mlist.real_name, self.lang),
737                 "archtype": self.type,
738                 "archive":  self.volNameToDesc(self.archive),
739                 "listinfo": mlist.GetScriptURL('listinfo', absolute=1),
740                 "firstdate": quotetime(self.firstdate),
741                 "lastdate": quotetime(self.lastdate),
742                 "size": self.size,
743                 }
744            i = {"thread": _("thread"),
745                 "subject": _("subject"),
746                 "author": _("author"),
747                 "date": _("date"),
748                 }
749        finally:
750            i18n.set_translation(otrans)
751
752        for t in i.keys():
753            cap = t[0].upper() + t[1:]
754            if self.type == cap:
755                d["%s_ref" % (t)] = ""
756                d["archtype"] = i[t]
757            else:
758                d["%s_ref" % (t)] = ('<a href="%s.html#start">[ %s ]</a>'
759                                     % (t, i[t]))
760        if self.charset:
761            d["encoding"] = html_charset % self.charset
762        else:
763            d["encoding"] = ""
764        return quick_maketext(
765            'archidxhead.html', d,
766            mlist=mlist)
767
768    def html_TOC(self):
769        mlist = self.maillist
770        listname = mlist.internal_name()
771        mbox = os.path.join(mlist.archive_dir()+'.mbox', listname+'.mbox')
772        d = {"listname": mlist.real_name,
773             "listinfo": mlist.GetScriptURL('listinfo', absolute=1),
774             "fullarch": '../%s.mbox/%s.mbox' % (listname, listname),
775             "size": sizeof(mbox, mlist.preferred_language),
776             'meta': '',
777             }
778        # Avoid i18n side-effects
779        otrans = i18n.get_translation()
780        i18n.set_language(mlist.preferred_language)
781        try:
782            if not self.archives:
783                d["noarchive_msg"] = _(
784                    '<P>Currently, there are no archives. </P>')
785                d["archive_listing_start"] = ""
786                d["archive_listing_end"] = ""
787                d["archive_listing"] = ""
788            else:
789                d["noarchive_msg"] = ""
790                d["archive_listing_start"] = quick_maketext(
791                    'archliststart.html',
792                    lang=mlist.preferred_language,
793                    mlist=mlist)
794                d["archive_listing_end"] = quick_maketext(
795                    'archlistend.html',
796                    mlist=mlist)
797
798                accum = []
799                for a in self.archives:
800                    accum.append(self.html_TOC_entry(a))
801                d["archive_listing"] = EMPTYSTRING.join(accum)
802        finally:
803            i18n.set_translation(otrans)
804        # The TOC is always in the charset of the list's preferred language
805        d['meta'] += html_charset % Utils.GetCharSet(mlist.preferred_language)
806        # The site can disable public access to the mbox file.
807        if mm_cfg.PUBLIC_MBOX:
808            template = 'archtoc.html'
809        else:
810            template = 'archtocnombox.html'
811        return quick_maketext(template, d, mlist=mlist)
812
813    def html_TOC_entry(self, arch):
814        # Check to see if the archive is gzip'd or not
815        txtfile = os.path.join(self.maillist.archive_dir(), arch + '.txt')
816        gzfile = txtfile + '.gz'
817        # which exists?  .txt.gz first, then .txt
818        if os.path.exists(gzfile):
819            file = gzfile
820            url = arch + '.txt.gz'
821            templ = '<td><A href="%(url)s">[ ' + _('Gzip\'d Text%(sz)s') \
822                    + ']</a></td>'
823        elif os.path.exists(txtfile):
824            file = txtfile
825            url = arch + '.txt'
826            templ = '<td><A href="%(url)s">[ ' + _('Text%(sz)s') + ']</a></td>'
827        else:
828            # neither found?
829            file = None
830        # in Python 1.5.2 we have an easy way to get the size
831        if file:
832            textlink = templ % {
833                'url': url,
834                'sz' : sizeof(file, self.maillist.preferred_language)
835                }
836        else:
837            # there's no archive file at all... hmmm.
838            textlink = ''
839        return quick_maketext(
840            'archtocentry.html',
841            {'archive': arch,
842             'archivelabel': self.volNameToDesc(arch),
843             'textlink': textlink
844             },
845            mlist=self.maillist)
846
847    def GetArchLock(self):
848        if self._lock_file:
849            return 1
850        self._lock_file = LockFile.LockFile(
851            os.path.join(mm_cfg.LOCK_DIR,
852                         self.maillist.internal_name() + '-arch.lock'))
853        try:
854            self._lock_file.lock(timeout=0.5)
855        except LockFile.TimeOutError:
856            return 0
857        return 1
858
859    def DropArchLock(self):
860        if self._lock_file:
861            self._lock_file.unlock(unconditionally=1)
862            self._lock_file = None
863
864    def processListArch(self):
865        name = self.maillist.ArchiveFileName()
866        wname= name+'.working'
867        ename= name+'.err_unarchived'
868        try:
869            os.stat(name)
870        except (IOError,os.error):
871            #no archive file, nothin to do -ddm
872            return
873
874        #see if arch is locked here -ddm
875        if not self.GetArchLock():
876            #another archiver is running, nothing to do. -ddm
877            return
878
879        #if the working file is still here, the archiver may have
880        # crashed during archiving. Save it, log an error, and move on.
881        try:
882            wf = open(wname)
883            syslog('error',
884                   'Archive working file %s present.  '
885                   'Check %s for possibly unarchived msgs',
886                   wname, ename)
887            omask = os.umask(007)
888            try:
889                ef = open(ename, 'a+')
890            finally:
891                os.umask(omask)
892            ef.seek(1,2)
893            if ef.read(1) <> '\n':
894                ef.write('\n')
895            ef.write(wf.read())
896            ef.close()
897            wf.close()
898            os.unlink(wname)
899        except IOError:
900            pass
901        os.rename(name,wname)
902        archfile = open(wname)
903        self.processUnixMailbox(archfile)
904        archfile.close()
905        os.unlink(wname)
906        self.DropArchLock()
907
908    def get_filename(self, article):
909        return '%06i.html' % (article.sequence,)
910
911    def get_archives(self, article):
912        """Return a list of indexes where the article should be filed.
913        A string can be returned if the list only contains one entry,
914        and the empty list is legal."""
915        res = self.dateToVolName(float(article.date))
916        self.message(C_("figuring article archives\n"))
917        self.message(res + "\n")
918        return res
919
920    def volNameToDesc(self, volname):
921        volname = volname.strip()
922        # Don't make these module global constants since we have to runtime
923        # translate them anyway.
924        monthdict = [
925            '',
926            _('January'),   _('February'), _('March'),    _('April'),
927            _('May'),       _('June'),     _('July'),     _('August'),
928            _('September'), _('October'),  _('November'), _('December')
929            ]
930        for each in self._volre.keys():
931            match = re.match(self._volre[each], volname)
932            # Let ValueErrors percolate up
933            if match:
934                year = int(match.group('year'))
935                if each == 'quarter':
936                    d =["", _("First"), _("Second"), _("Third"), _("Fourth") ]
937                    ord = d[int(match.group('quarter'))]
938                    return _("%(ord)s quarter %(year)i")
939                elif each == 'month':
940                    monthstr = match.group('month').lower()
941                    for i in range(1, 13):
942                        monthname = time.strftime("%B", (1999,i,1,0,0,0,0,1,0))
943                        if monthstr.lower() == monthname.lower():
944                            month = monthdict[i]
945                            return _("%(month)s %(year)i")
946                    raise ValueError, "%s is not a month!" % monthstr
947                elif each == 'week':
948                    month = monthdict[int(match.group("month"))]
949                    day = int(match.group("day"))
950                    return _("The Week Of Monday %(day)i %(month)s %(year)i")
951                elif each == 'day':
952                    month = monthdict[int(match.group("month"))]
953                    day = int(match.group("day"))
954                    return _("%(day)i %(month)s %(year)i")
955                else:
956                    return match.group('year')
957        raise ValueError, "%s is not a valid volname" % volname
958
959# The following two methods should be inverses of each other. -ddm
960
961    def dateToVolName(self,date):
962        datetuple=time.localtime(date)
963        if self.ARCHIVE_PERIOD=='year':
964            return time.strftime("%Y",datetuple)
965        elif self.ARCHIVE_PERIOD=='quarter':
966            if datetuple[1] in [1,2,3]:
967                return time.strftime("%Yq1",datetuple)
968            elif datetuple[1] in [4,5,6]:
969                return time.strftime("%Yq2",datetuple)
970            elif datetuple[1] in [7,8,9]:
971                return time.strftime("%Yq3",datetuple)
972            else:
973                return time.strftime("%Yq4",datetuple)
974        elif self.ARCHIVE_PERIOD == 'day':
975            return time.strftime("%Y%m%d", datetuple)
976        elif self.ARCHIVE_PERIOD == 'week':
977            # Reconstruct "seconds since epoch", and subtract weekday
978            # multiplied by the number of seconds in a day.
979            monday = time.mktime(datetuple) - datetuple[6] * 24 * 60 * 60
980            # Build a new datetuple from this "seconds since epoch" value
981            datetuple = time.localtime(monday)
982            return time.strftime("Week-of-Mon-%Y%m%d", datetuple)
983        # month. -ddm
984        else:
985            return time.strftime("%Y-%B",datetuple)
986
987
988    def volNameToDate(self, volname):
989        volname = volname.strip()
990        for each in self._volre.keys():
991            match = re.match(self._volre[each],volname)
992            if match:
993                year = int(match.group('year'))
994                month = 1
995                day = 1
996                if each == 'quarter':
997                    q = int(match.group('quarter'))
998                    month = (q * 3) - 2
999                elif each == 'month':
1000                    monthstr = match.group('month').lower()
1001                    m = []
1002                    for i in range(1,13):
1003                        m.append(
1004                            time.strftime("%B",(1999,i,1,0,0,0,0,1,0)).lower())
1005                    try:
1006                        month = m.index(monthstr) + 1
1007                    except ValueError:
1008                        pass
1009                elif each == 'week' or each == 'day':
1010                    month = int(match.group("month"))
1011                    day = int(match.group("day"))
1012                try:
1013                    return time.mktime((year,month,1,0,0,0,0,1,-1))
1014                except OverflowError:
1015                    return 0.0
1016        return 0.0
1017
1018    def sortarchives(self):
1019        def sf(a, b):
1020            al = self.volNameToDate(a)
1021            bl = self.volNameToDate(b)
1022            if al > bl:
1023                return 1
1024            elif al < bl:
1025                return -1
1026            else:
1027                return 0
1028        if self.ARCHIVE_PERIOD in ('month','year','quarter'):
1029            self.archives.sort(sf)
1030        else:
1031            self.archives.sort()
1032        self.archives.reverse()
1033
1034    def message(self, msg):
1035        if self.VERBOSE:
1036            f = sys.stderr
1037            f.write(msg)
1038            if msg[-1:] != '\n':
1039                f.write('\n')
1040            f.flush()
1041
1042    def open_new_archive(self, archive, archivedir):
1043        index_html = os.path.join(archivedir, 'index.html')
1044        try:
1045            os.unlink(index_html)
1046        except:
1047            pass
1048        os.symlink(self.DEFAULTINDEX+'.html',index_html)
1049
1050    def write_index_header(self):
1051        self.depth=0
1052        print self.html_head()
1053        if not self.THREADLAZY and self.type=='Thread':
1054            self.message(C_("Computing threaded index\n"))
1055            self.updateThreadedIndex()
1056
1057    def write_index_footer(self):
1058        for i in range(self.depth):
1059            print '</UL>'
1060        print self.html_foot()
1061
1062    def write_index_entry(self, article):
1063        subject = self.get_header("subject", article)
1064        author = self.get_header("author", article)
1065        if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS:
1066            try:
1067                author = re.sub('@', _(' at '), author)
1068            except UnicodeError:
1069                # Non-ASCII author contains '@' ... no valid email anyway
1070                pass
1071        subject = CGIescape(subject, self.lang)
1072        author = CGIescape(author, self.lang)
1073
1074        d = {
1075            'filename': urllib.quote(article.filename),
1076            'subject':  subject,
1077            'sequence': article.sequence,
1078            'author':   author
1079        }
1080        print quick_maketext(
1081            'archidxentry.html', d,
1082            mlist=self.maillist)
1083
1084    def get_header(self, field, article):
1085        # if we have no decoded header, return the encoded one
1086        result = article.decoded.get(field)
1087        if result is None:
1088            return getattr(article, field)
1089        # otherwise, the decoded one will be Unicode
1090        return result
1091
1092    def write_threadindex_entry(self, article, depth):
1093        if depth < 0:
1094            self.message('depth<0')
1095            depth = 0
1096        if depth > self.THREADLEVELS:
1097            depth = self.THREADLEVELS
1098        if depth < self.depth:
1099            for i in range(self.depth-depth):
1100                print '</UL>'
1101        elif depth > self.depth:
1102            for i in range(depth-self.depth):
1103                print '<UL>'
1104        print '<!--%i %s -->' % (depth, article.threadKey)
1105        self.depth = depth
1106        self.write_index_entry(article)
1107
1108    def write_TOC(self):
1109        self.sortarchives()
1110        omask = os.umask(002)
1111        try:
1112            toc = open(os.path.join(self.basedir, 'index.html'), 'w')
1113        finally:
1114            os.umask(omask)
1115        toc.write(self.html_TOC())
1116        toc.close()
1117
1118    def write_article(self, index, article, path):
1119        # called by add_article
1120        omask = os.umask(002)
1121        try:
1122            f = open(path, 'w')
1123        finally:
1124            os.umask(omask)
1125        f.write(article.as_html())
1126        f.close()
1127
1128        # Write the text article to the text archive.
1129        path = os.path.join(self.basedir, "%s.txt" % index)
1130        omask = os.umask(002)
1131        try:
1132            f = open(path, 'a+')
1133        finally:
1134            os.umask(omask)
1135        f.write(article.as_text())
1136        f.close()
1137
1138    def update_archive(self, archive):
1139        self.__super_update_archive(archive)
1140        # only do this if the gzip module was imported globally, and
1141        # gzip'ing was enabled via mm_cfg.GZIP_ARCHIVE_TXT_FILES.  See
1142        # above.
1143        if gzip:
1144            archz = None
1145            archt = None
1146            txtfile = os.path.join(self.basedir, '%s.txt' % archive)
1147            gzipfile = os.path.join(self.basedir, '%s.txt.gz' % archive)
1148            oldgzip = os.path.join(self.basedir, '%s.old.txt.gz' % archive)
1149            try:
1150                # open the plain text file
1151                archt = open(txtfile)
1152            except IOError:
1153                return
1154            try:
1155                os.rename(gzipfile, oldgzip)
1156                archz = gzip.open(oldgzip)
1157            except (IOError, RuntimeError, os.error):
1158                pass
1159            try:
1160                ou = os.umask(002)
1161                newz = gzip.open(gzipfile, 'w')
1162            finally:
1163                # XXX why is this a finally?
1164                os.umask(ou)
1165            if archz:
1166                newz.write(archz.read())
1167                archz.close()
1168                os.unlink(oldgzip)
1169            # XXX do we really need all this in a try/except?
1170            try:
1171                newz.write(archt.read())
1172                newz.close()
1173                archt.close()
1174            except IOError:
1175                pass
1176            os.unlink(txtfile)
1177
1178    _skip_attrs = ('maillist', '_lock_file', 'charset')
1179
1180    def getstate(self):
1181        d={}
1182        for each in self.__dict__.keys():
1183            if not (each in self._skip_attrs
1184                    or each.upper() == each):
1185                d[each] = self.__dict__[each]
1186        return d
1187
1188    # Add <A HREF="..."> tags around URLs and e-mail addresses.
1189
    def __processbody_URLquote(self, lines):
        """Escape body lines for HTML and hyperlink the addresses and URLs.

        `lines` is modified in place and nothing is returned.  Every entry
        that is not None is replaced by its CGIescape()d text in which each
        emailpat or urlpat match is wrapped in an <A HREF="..."> tag.  When
        self.IQUOTES is set, a line matching quotedpat additionally gets
        its remainder wrapped in <i> ... </I>.

        When mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS is set, the '@' in an
        address is replaced by the translated ' at ' text and the link
        points at the list's listinfo page instead of a mailto: URL.
        """
        # XXX a lot to do here:
        # 1. use lines directly, rather than source and dest
        # 2. make it clearer
        # 3. make it faster
        # TK: Prepare for unicode obscure.
        atmark = _(' at ')
        # Only the first line is inspected to decide whether the body is
        # unicode; the replacement text is then converted to match.
        if lines and isinstance(lines[0], types.UnicodeType):
            atmark = unicode(atmark, Utils.GetCharSet(self.lang), 'replace')
        # source is a private copy to read from; dest is the caller's list,
        # so the results written to dest[i] are what the caller sees.
        source = lines[:]
        dest = lines
        last_line_was_quoted = 0
        for i in xrange(0, len(source)):
            # Lorig is not referenced again in this method.
            Lorig = L = source[i]
            prefix = suffix = ""
            if L is None:
                continue
            # Italicise quoted text
            if self.IQUOTES:
                quoted = quotedpat.match(L)
                if quoted is None:
                    last_line_was_quoted = 0
                else:
                    # From here on `quoted` is the offset where the quote
                    # marker ends; the marker is escaped into the prefix and
                    # only the rest of the line is processed further.
                    quoted = quoted.end(0)
                    prefix = CGIescape(L[:quoted], self.lang) + '<i>'
                    suffix = '</I>'
                    if self.SHOWHTML:
                        suffix += '<BR>'
                        # a break also goes in front of the first line of a
                        # run of quoted lines
                        if not last_line_was_quoted:
                            prefix = '<BR>' + prefix
                    L = L[quoted:]
                    last_line_was_quoted = 1
            # Check for an e-mail address
            # L2 collects the finished left part of the line; L always holds
            # the text that still has to be searched.
            L2 = ""
            jr = emailpat.search(L)
            kr = urlpat.search(L)
            while jr is not None or kr is not None:
                # j and k are the start offsets of the next address and the
                # next URL, with -1 standing for "no match".
                if jr == None:
                    j = -1
                else:
                    j = jr.start(0)
                if kr is None:
                    k = -1
                else:
                    k = kr.start(0)
                # Handle whichever of the two comes first in the line.
                if j != -1 and (j < k or k == -1):
                    text = jr.group(1)
                    # Measure before obscuring: length is used below to skip
                    # the matched text in L, and obscuring changes the text.
                    length = len(text)
                    if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS:
                        text = re.sub('@', atmark, text)
                        URL = self.maillist.GetScriptURL(
                            'listinfo', absolute=1)
                    else:
                        URL = 'mailto:' + text
                    pos = j
                elif k != -1 and (j > k or j == -1):
                    text = URL = kr.group(1)
                    length = len(text)
                    pos = k
                else: # j==k
                    raise ValueError, "j==k: This can't happen!"
                #length = len(text)
                #self.message("URL: %s %s %s \n"
                #             % (CGIescape(L[:pos]), URL, CGIescape(text)))
                L2 += '%s<A HREF="%s">%s</A>' % (
                    CGIescape(L[:pos], self.lang),
                    html_quote(URL), CGIescape(text, self.lang))
                # Continue after the linked text.
                # NOTE(review): this assumes group 1 begins at the start of
                # the match -- confirm against emailpat and urlpat.
                L = L[pos+length:]
                jr = emailpat.search(L)
                kr = urlpat.search(L)
            # The loop above only ends once neither pattern matches, so this
            # test holds whenever it is reached; the remaining tail of the
            # line is escaped here.
            if jr is None and kr is None:
                L = CGIescape(L, self.lang)
            L = prefix + L2 + L + suffix
            source[i] = None
            dest[i] = L
1265
    # Perform Hypermail-style processing of <HTML></HTML> directives
    # in message bodies.  Lines between <HTML> and </HTML> are meant to be
    # written out precisely as they are; all other lines are left for the
    # further processing steps that follow.
1270
1271    def __processbody_HTML(self, lines):
1272        # XXX need to make this method modify in place
1273        source = lines[:]
1274        dest = lines
1275        l = len(source)
1276        i = 0
1277        while i < l:
1278            while i < l and htmlpat.match(source[i]) is None:
1279                i = i + 1
1280            if i < l:
1281                source[i] = None
1282                i = i + 1
1283            while i < l and nohtmlpat.match(source[i]) is None:
1284                dest[i], source[i] = source[i], None
1285                i = i + 1
1286            if i < l:
1287                source[i] = None
1288                i = i + 1
1289
1290    def format_article(self, article):
1291        # called from add_article
1292        # TBD: Why do the HTML formatting here and keep it in the
1293        # pipermail database?  It makes more sense to do the html
1294        # formatting as the article is being written as html and toss
1295        # the data after it has been written to the archive file.
1296        lines = filter(None, article.body)
1297        # Handle <HTML> </HTML> directives
1298        if self.ALLOWHTML:
1299            self.__processbody_HTML(lines)
1300        self.__processbody_URLquote(lines)
1301        if not self.SHOWHTML and lines:
1302            lines.insert(0, '<PRE>')
1303            lines.append('</PRE>')
1304        else:
1305            # Do fancy formatting here
1306            if self.SHOWBR:
1307                lines = map(lambda x:x + "<BR>", lines)
1308            else:
1309                for i in range(0, len(lines)):
1310                    s = lines[i]
1311                    if s[0:1] in ' \t\n':
1312                        lines[i] = '<P>' + s
1313        article.html_body = lines
1314        return article
1315
1316    def update_article(self, arcdir, article, prev, next):
1317        seq = article.sequence
1318        filename = os.path.join(arcdir, article.filename)
1319        self.message(C_('Updating HTML for article %(seq)s'))
1320        try:
1321            f = open(filename)
1322            article.loadbody_fromHTML(f)
1323            f.close()
1324        except IOError, e:
1325            if e.errno <> errno.ENOENT: raise
1326            self.message(C_('article file %(filename)s is missing!'))
1327        article.prev = prev
1328        article.next = next
1329        omask = os.umask(002)
1330        try:
1331            f = open(filename, 'w')
1332        finally:
1333            os.umask(omask)
1334        f.write(article.as_html())
1335        f.close()
1336