1# Copyright (C) 1998-2018 by the Free Software Foundation, Inc. 2# 3# This program is free software; you can redistribute it and/or 4# modify it under the terms of the GNU General Public License 5# as published by the Free Software Foundation; either version 2 6# of the License, or (at your option) any later version. 7# 8# This program is distributed in the hope that it will be useful, 9# but WITHOUT ANY WARRANTY; without even the implied warranty of 10# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11# GNU General Public License for more details. 12# 13# You should have received a copy of the GNU General Public License 14# along with this program; if not, write to the Free Software 15# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, 16# USA. 17 18"""HyperArch: Pipermail archiving for Mailman 19 20 - The Dragon De Monsyne <dragondm@integral.org> 21 22 TODO: 23 - Should be able to force all HTML to be regenerated next time the 24 archive is run, in case a template is changed. 25 - Run a command to generate tarball of html archives for downloading 26 (probably in the 'update_dirty_archives' method). 27""" 28 29from __future__ import nested_scopes 30 31import sys 32import re 33import errno 34import urllib 35import time 36import os 37import types 38import HyperDatabase 39import pipermail 40import weakref 41import binascii 42 43from email.Header import decode_header, make_header 44from email.Errors import HeaderParseError 45from email.Charset import Charset 46 47from Mailman import mm_cfg 48from Mailman import Utils 49from Mailman import Errors 50from Mailman import LockFile 51from Mailman import MailList 52from Mailman import i18n 53from Mailman.SafeDict import SafeDict 54from Mailman.Logging.Syslog import syslog 55from Mailman.Mailbox import ArchiverMailbox 56 57# Set up i18n. Assume the current language has already been set in the caller. 58_ = i18n._ 59C_ = i18n.C_ 60 61gzip = None 62if mm_cfg.GZIP_ARCHIVE_TXT_FILES: 63 try: 64 import gzip 65 except ImportError: 66 pass 67 68EMPTYSTRING = '' 69NL = '\n' 70 71# MacOSX has a default stack size that is too small for deeply recursive 72# regular expressions. We see this as crashes in the Python test suite when 73# running test_re.py and test_sre.py. The fix is to set the stack limit to 74# 2048; the general recommendation is to do in the shell before running the 75# test suite. But that's inconvenient for a daemon like the qrunner. 76# 77# AFAIK, this problem only affects the archiver, so we're adding this work 78# around to this file (it'll get imported by the bundled pipermail or by the 79# bin/arch script. We also only do this on darwin, a.k.a. MacOSX. 80if sys.platform == 'darwin': 81 try: 82 import resource 83 except ImportError: 84 pass 85 else: 86 soft, hard = resource.getrlimit(resource.RLIMIT_STACK) 87 newsoft = min(hard, max(soft, 1024*2048)) 88 resource.setrlimit(resource.RLIMIT_STACK, (newsoft, hard)) 89 90 91try: 92 True, False 93except NameError: 94 True = 1 95 False = 0 96 97 98 99def html_quote(s, lang=None): 100 repls = ( ('&', '&'), 101 ("<", '<'), 102 (">", '>'), 103 ('"', '"')) 104 for thing, repl in repls: 105 s = s.replace(thing, repl) 106 return Utils.uncanonstr(s, lang) 107 108 109def url_quote(s): 110 return urllib.quote(s) 111 112 113def null_to_space(s): 114 return s.replace('\000', ' ') 115 116 117def sizeof(filename, lang): 118 try: 119 size = os.path.getsize(filename) 120 except OSError, e: 121 # ENOENT can happen if the .mbox file was moved away or deleted, and 122 # an explicit mbox file name was given to bin/arch. 123 if e.errno <> errno.ENOENT: raise 124 return _('size not available') 125 if size < 1000: 126 # Avoid i18n side-effects 127 otrans = i18n.get_translation() 128 try: 129 i18n.set_language(lang) 130 out = _(' %(size)i bytes ') 131 finally: 132 i18n.set_translation(otrans) 133 return out 134 elif size < 1000000: 135 return ' %d KB ' % (size / 1000) 136 # GB?? :-) 137 return ' %d MB ' % (size / 1000000) 138 139 140html_charset = '<META http-equiv="Content-Type" ' \ 141 'content="text/html; charset=%s">' 142 143def CGIescape(arg, lang=None): 144 if isinstance(arg, types.UnicodeType): 145 s = Utils.websafe(arg) 146 else: 147 s = Utils.websafe(str(arg)) 148 return Utils.uncanonstr(s.replace('"', '"'), lang) 149 150# Parenthesized human name 151paren_name_pat = re.compile(r'([(].*[)])') 152 153# Subject lines preceded with 'Re:' 154REpat = re.compile( r"\s*RE\s*(\[\d+\]\s*)?:\s*", re.IGNORECASE) 155 156# E-mail addresses and URLs in text 157emailpat = re.compile(r'([-+,.\w]+@[-+.\w]+)') 158 159# Argh! This pattern is buggy, and will choke on URLs with GET parameters. 160# MAS: Given that people are not constrained in how they write URIs in plain 161# text, it is not possible to have a single regexp to reliably match them. 162# The regexp below is intended to match straightforward cases. Even humans 163# can't reliably tell whether various punctuation at the end of a URI is part 164# of the URI or not. 165urlpat = re.compile(r'([a-z]+://.*?)(?:_\s|_$|$|[]})>\'"\s])', re.IGNORECASE) 166 167# Blank lines 168blankpat = re.compile(r'^\s*$') 169 170# Starting <html> directive 171htmlpat = re.compile(r'^\s*<HTML>\s*$', re.IGNORECASE) 172# Ending </html> directive 173nohtmlpat = re.compile(r'^\s*</HTML>\s*$', re.IGNORECASE) 174# Match quoted text 175quotedpat = re.compile(r'^([>|:]|>)+') 176 177 178 179# Like Utils.maketext() but with caching to improve performance. 180# 181# _templatefilepathcache is used to associate a (templatefile, lang, listname) 182# key with the file system path to a template file. This path is the one that 183# the Utils.findtext() function has computed is the one to match the values in 184# the key tuple. 185# 186# _templatecache associate a file system path as key with the text 187# returned after processing the contents of that file by Utils.findtext() 188# 189# We keep two caches to reduce the amount of template text kept in memory, 190# since the _templatefilepathcache is a many->one mapping and _templatecache 191# is a one->one mapping. Imagine 1000 lists all using the same default 192# English template. 193 194_templatefilepathcache = {} 195_templatecache = {} 196 197def quick_maketext(templatefile, dict=None, lang=None, mlist=None): 198 if mlist is None: 199 listname = '' 200 else: 201 listname = mlist._internal_name 202 if lang is None: 203 if mlist is None: 204 lang = mm_cfg.DEFAULT_SERVER_LANGUAGE 205 else: 206 lang = mlist.preferred_language 207 cachekey = (templatefile, lang, listname) 208 filepath = _templatefilepathcache.get(cachekey) 209 if filepath: 210 template = _templatecache.get(filepath) 211 if filepath is None or template is None: 212 # Use the basic maketext, with defaults to get the raw template 213 template, filepath = Utils.findtext(templatefile, lang=lang, 214 raw=True, mlist=mlist) 215 _templatefilepathcache[cachekey] = filepath 216 _templatecache[filepath] = template 217 # Copied from Utils.maketext() 218 text = template 219 if dict is not None: 220 try: 221 sdict = SafeDict(dict) 222 try: 223 text = sdict.interpolate(template) 224 except UnicodeError: 225 # Try again after coercing the template to unicode 226 utemplate = unicode(template, 227 Utils.GetCharSet(lang), 228 'replace') 229 text = sdict.interpolate(utemplate) 230 except (TypeError, ValueError), e: 231 # The template is really screwed up 232 syslog('error', 'broken template: %s\n%s', filepath, e) 233 # Make sure the text is in the given character set, or html-ify any bogus 234 # characters. 235 return Utils.uncanonstr(text, lang) 236 237 238 239# Note: I'm overriding most, if not all of the pipermail Article class 240# here -ddm 241# The Article class encapsulates a single posting. The attributes are: 242# 243# sequence : Sequence number, unique for each article in a set of archives 244# subject : Subject 245# datestr : The posting date, in human-readable format 246# date : The posting date, in purely numeric format 247# fromdate : The posting date, in `unixfrom' format 248# headers : Any other headers of interest 249# author : The author's name (and possibly organization) 250# email : The author's e-mail address 251# msgid : A unique message ID 252# in_reply_to : If !="", this is the msgid of the article being replied to 253# references: A (possibly empty) list of msgid's of earlier articles in 254# the thread 255# body : A list of strings making up the message body 256 257class Article(pipermail.Article): 258 __super_init = pipermail.Article.__init__ 259 __super_set_date = pipermail.Article._set_date 260 261 _last_article_time = time.time() 262 263 def __init__(self, message=None, sequence=0, keepHeaders=[], 264 lang=mm_cfg.DEFAULT_SERVER_LANGUAGE, mlist=None): 265 self.__super_init(message, sequence, keepHeaders) 266 self.prev = None 267 self.next = None 268 # Trim Re: from the subject line 269 i = 0 270 while i != -1: 271 result = REpat.match(self.subject) 272 if result: 273 i = result.end(0) 274 self.subject = self.subject[i:] 275 if self.subject == '': 276 self.subject = _('No subject') 277 else: 278 i = -1 279 # Useful to keep around 280 self._lang = lang 281 self._mlist = mlist 282 283 if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS: 284 # Avoid i18n side-effects. Note that the language for this 285 # article (for this list) could be different from the site-wide 286 # preferred language, so we need to ensure no side-effects will 287 # occur. Think what happens when executing bin/arch. 288 otrans = i18n.get_translation() 289 try: 290 i18n.set_language(lang) 291 if self.author == self.email: 292 self.author = self.email = re.sub('@', _(' at '), 293 self.email) 294 else: 295 self.email = re.sub('@', _(' at '), self.email) 296 finally: 297 i18n.set_translation(otrans) 298 299 # Snag the content-* headers. RFC 1521 states that their values are 300 # case insensitive. 301 ctype = message.get('Content-Type', 'text/plain') 302 cenc = message.get('Content-Transfer-Encoding', '') 303 self.ctype = ctype.lower() 304 self.cenc = cenc.lower() 305 self.decoded = {} 306 cset = Utils.GetCharSet(mlist.preferred_language) 307 cset_out = Charset(cset).output_charset or cset 308 if isinstance(cset_out, unicode): 309 # email 3.0.1 (python 2.4) doesn't like unicode 310 cset_out = cset_out.encode('us-ascii') 311 charset = message.get_content_charset(cset_out) 312 if charset: 313 charset = charset.lower().strip() 314 if charset[0]=='"' and charset[-1]=='"': 315 charset = charset[1:-1] 316 if charset[0]=="'" and charset[-1]=="'": 317 charset = charset[1:-1] 318 try: 319 body = message.get_payload(decode=True) 320 except binascii.Error: 321 body = None 322 if body and charset != Utils.GetCharSet(self._lang): 323 # decode body 324 try: 325 body = unicode(body, charset) 326 except (UnicodeError, LookupError): 327 body = None 328 if body: 329 self.body = [l + "\n" for l in body.splitlines()] 330 331 self.decode_headers() 332 333 # Mapping of listnames to MailList instances as a weak value dictionary. 334 # This code is copied from Runner.py but there's one important operational 335 # difference. In Runner.py, we always .Load() the MailList object for 336 # each _dispose() run, otherwise the object retrieved from the cache won't 337 # be up-to-date. Since we're creating a new HyperArchive instance for 338 # each message being archived, we don't need to worry about that -- but it 339 # does mean there are additional opportunities for optimization. 340 _listcache = weakref.WeakValueDictionary() 341 342 def _open_list(self, listname): 343 # Cache the open list so that any use of the list within this process 344 # uses the same object. We use a WeakValueDictionary so that when the 345 # list is no longer necessary, its memory is freed. 346 mlist = self._listcache.get(listname) 347 if not mlist: 348 try: 349 mlist = MailList.MailList(listname, lock=0) 350 except Errors.MMListError, e: 351 syslog('error', 'error opening list: %s\n%s', listname, e) 352 return None 353 else: 354 self._listcache[listname] = mlist 355 return mlist 356 357 def __getstate__(self): 358 d = self.__dict__.copy() 359 # We definitely don't want to pickle the MailList instance, so just 360 # pickle a reference to it. 361 if d.has_key('_mlist'): 362 mlist = d['_mlist'] 363 del d['_mlist'] 364 else: 365 mlist = None 366 if mlist: 367 d['__listname'] = self._mlist.internal_name() 368 else: 369 d['__listname'] = None 370 # Delete a few other things we don't want in the pickle 371 for attr in ('prev', 'next', 'body'): 372 if d.has_key(attr): 373 del d[attr] 374 d['body'] = [] 375 return d 376 377 def __setstate__(self, d): 378 # For loading older Articles via pickle. All this stuff was added 379 # when Simone Piunni and Tokio Kikuchi i18n'ified Pipermail. See SF 380 # patch #594771. 381 self.__dict__ = d 382 listname = d.get('__listname') 383 if listname: 384 del d['__listname'] 385 d['_mlist'] = self._open_list(listname) 386 if not d.has_key('_lang'): 387 if hasattr(self, '_mlist'): 388 self._lang = self._mlist.preferred_language 389 else: 390 self._lang = mm_cfg.DEFAULT_SERVER_LANGUAGE 391 if not d.has_key('cenc'): 392 self.cenc = None 393 if not d.has_key('decoded'): 394 self.decoded = {} 395 396 def setListIfUnset(self, mlist): 397 if getattr(self, '_mlist', None) is None: 398 self._mlist = mlist 399 400 def quote(self, buf): 401 return html_quote(buf, self._lang) 402 403 def decode_headers(self): 404 """MIME-decode headers. 405 406 If the email, subject, or author attributes contain non-ASCII 407 characters using the encoded-word syntax of RFC 2047, decoded versions 408 of those attributes are placed in the self.decoded (a dictionary). 409 410 If the list's charset differs from the header charset, an attempt is 411 made to decode the headers as Unicode. If that fails, they are left 412 undecoded. 413 """ 414 author = self.decode_charset(self.author) 415 subject = self.decode_charset(self.subject) 416 if author: 417 self.decoded['author'] = author 418 email = self.decode_charset(self.email) 419 if email: 420 self.decoded['email'] = email 421 if subject: 422 if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS: 423 otrans = i18n.get_translation() 424 try: 425 i18n.set_language(self._lang) 426 atmark = unicode(_(' at '), Utils.GetCharSet(self._lang)) 427 subject = re.sub(r'([-+,.\w]+)@([-+.\w]+)', 428 '\g<1>' + atmark + '\g<2>', subject) 429 finally: 430 i18n.set_translation(otrans) 431 self.decoded['subject'] = subject 432 self.decoded['stripped'] = self.strip_subject(subject or self.subject) 433 434 def strip_subject(self, subject): 435 # Strip subject_prefix and Re: for subject sorting 436 # This part was taken from CookHeaders.py (TK) 437 prefix = self._mlist.subject_prefix.strip() 438 if prefix: 439 prefix_pat = re.escape(prefix) 440 prefix_pat = '%'.join(prefix_pat.split(r'\%')) 441 prefix_pat = re.sub(r'%\d*d', r'\s*\d+\s*', prefix_pat) 442 subject = re.sub(prefix_pat, '', subject) 443 subject = subject.lstrip() 444 # MAS Should we strip FW and FWD too? 445 strip_pat = re.compile('^((RE|AW|SV|VS)(\[\d+\])?:\s*)+', re.I) 446 stripped = strip_pat.sub('', subject) 447 # Also remove whitespace to avoid folding/unfolding differences 448 stripped = re.sub('\s', '', stripped) 449 return stripped 450 451 def decode_charset(self, field): 452 # TK: This function was rewritten for unifying to Unicode. 453 # Convert 'field' into Unicode one line string. 454 try: 455 pairs = decode_header(field) 456 ustr = make_header(pairs).__unicode__() 457 except (LookupError, UnicodeError, ValueError, HeaderParseError): 458 # assume list's language 459 cset = Utils.GetCharSet(self._mlist.preferred_language) 460 if cset == 'us-ascii': 461 cset = 'iso-8859-1' # assume this for English list 462 ustr = unicode(field, cset, 'replace') 463 return u''.join(ustr.splitlines()) 464 465 def as_html(self): 466 d = self.__dict__.copy() 467 # avoid i18n side-effects 468 otrans = i18n.get_translation() 469 i18n.set_language(self._lang) 470 try: 471 d["prev"], d["prev_wsubj"] = self._get_prev() 472 d["next"], d["next_wsubj"] = self._get_next() 473 474 d["email_html"] = self.quote(self.email) 475 d["title"] = self.quote(self.subject) 476 d["subject_html"] = self.quote(self.subject) 477 d["message_id"] = self.quote(self._message_id) 478 # TK: These two _url variables are used to compose a response 479 # from the archive web page. So, ... 480 d["subject_url"] = url_quote('Re: ' + self.subject) 481 d["in_reply_to_url"] = url_quote(self._message_id) 482 if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS: 483 # Point the mailto url back to the list 484 author = re.sub('@', _(' at '), self.author) 485 emailurl = self._mlist.GetListEmail() 486 else: 487 author = self.author 488 emailurl = self.email 489 d["author_html"] = self.quote(author) 490 d["email_url"] = url_quote(emailurl) 491 d["datestr_html"] = self.quote(i18n.ctime(int(self.date))) 492 d["body"] = self._get_body() 493 d['listurl'] = self._mlist.GetScriptURL('listinfo', absolute=1) 494 d['listname'] = self._mlist.real_name 495 d['encoding'] = '' 496 finally: 497 i18n.set_translation(otrans) 498 499 charset = Utils.GetCharSet(self._lang) 500 d["encoding"] = html_charset % charset 501 502 self._add_decoded(d) 503 return quick_maketext( 504 'article.html', d, 505 lang=self._lang, mlist=self._mlist) 506 507 def _get_prev(self): 508 """Return the href and subject for the previous message""" 509 if self.prev: 510 subject = self._get_subject_enc(self.prev) 511 prev = ('<LINK REL="Previous" HREF="%s">' 512 % (url_quote(self.prev.filename))) 513 prev_wsubj = ('<LI>' + _('Previous message (by thread):') + 514 ' <A HREF="%s">%s\n</A></li>' 515 % (url_quote(self.prev.filename), 516 self.quote(subject))) 517 else: 518 prev = prev_wsubj = "" 519 return prev, prev_wsubj 520 521 def _get_subject_enc(self, art): 522 """Return the subject of art, decoded if possible. 523 524 If the charset of the current message and art match and the 525 article's subject is encoded, decode it. 526 """ 527 return art.decoded.get('subject', art.subject) 528 529 def _get_next(self): 530 """Return the href and subject for the previous message""" 531 if self.next: 532 subject = self._get_subject_enc(self.next) 533 next = ('<LINK REL="Next" HREF="%s">' 534 % (url_quote(self.next.filename))) 535 next_wsubj = ('<LI>' + _('Next message (by thread):') + 536 ' <A HREF="%s">%s\n</A></li>' 537 % (url_quote(self.next.filename), 538 self.quote(subject))) 539 else: 540 next = next_wsubj = "" 541 return next, next_wsubj 542 543 _rx_quote = re.compile('=([A-F0-9][A-F0-9])') 544 _rx_softline = re.compile('=[ \t]*$') 545 546 def _get_body(self): 547 """Return the message body ready for HTML, decoded if necessary""" 548 try: 549 body = self.html_body 550 except AttributeError: 551 body = self.body 552 return null_to_space(EMPTYSTRING.join(body)) 553 554 def _add_decoded(self, d): 555 """Add encoded-word keys to HTML output""" 556 for src, dst in (('author', 'author_html'), 557 ('email', 'email_html'), 558 ('subject', 'subject_html'), 559 ('subject', 'title')): 560 if self.decoded.has_key(src): 561 d[dst] = self.quote(self.decoded[src]) 562 563 def as_text(self): 564 d = self.__dict__.copy() 565 # We need to guarantee a valid From_ line, even if there are 566 # bososities in the headers. 567 if not d.get('fromdate', '').strip(): 568 d['fromdate'] = time.ctime(time.time()) 569 if not d.get('email', '').strip(): 570 d['email'] = 'bogus@does.not.exist.com' 571 if not d.get('datestr', '').strip(): 572 d['datestr'] = time.ctime(time.time()) 573 # 574 headers = ['From %(email)s %(fromdate)s', 575 'From: %(email)s (%(author)s)', 576 'Date: %(datestr)s', 577 'Subject: %(subject)s'] 578 if d['_in_reply_to']: 579 headers.append('In-Reply-To: %(_in_reply_to)s') 580 if d['_references']: 581 headers.append('References: %(_references)s') 582 if d['_message_id']: 583 headers.append('Message-ID: %(_message_id)s') 584 body = EMPTYSTRING.join(self.body) 585 cset = Utils.GetCharSet(self._lang) 586 # Coerce the body to Unicode and replace any invalid characters. 587 if not isinstance(body, types.UnicodeType): 588 body = unicode(body, cset, 'replace') 589 if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS: 590 otrans = i18n.get_translation() 591 try: 592 i18n.set_language(self._lang) 593 atmark = unicode(_(' at '), cset) 594 body = re.sub(r'([-+,.\w]+)@([-+.\w]+)', 595 '\g<1>' + atmark + '\g<2>', body) 596 finally: 597 i18n.set_translation(otrans) 598 # Return body to character set of article. 599 body = body.encode(cset, 'replace') 600 return NL.join(headers) % d + '\n\n' + body + '\n' 601 602 def _set_date(self, message): 603 self.__super_set_date(message) 604 self.fromdate = time.ctime(int(self.date)) 605 606 def loadbody_fromHTML(self,fileobj): 607 self.body = [] 608 begin = 0 609 while 1: 610 line = fileobj.readline() 611 if not line: 612 break 613 if not begin: 614 if line.strip() == '<!--beginarticle-->': 615 begin = 1 616 continue 617 if line.strip() == '<!--endarticle-->': 618 break 619 self.body.append(line) 620 621 def finished_update_article(self): 622 self.body = [] 623 try: 624 del self.html_body 625 except AttributeError: 626 pass 627 628 629class HyperArchive(pipermail.T): 630 __super_init = pipermail.T.__init__ 631 __super_update_archive = pipermail.T.update_archive 632 __super_update_dirty_archives = pipermail.T.update_dirty_archives 633 __super_add_article = pipermail.T.add_article 634 635 # some defaults 636 DIRMODE = 02775 637 FILEMODE = 0660 638 639 VERBOSE = 0 640 DEFAULTINDEX = 'thread' 641 ARCHIVE_PERIOD = 'month' 642 643 THREADLAZY = 0 644 THREADLEVELS = 3 645 646 ALLOWHTML = 1 # "Lines between <html></html>" handled as is. 647 SHOWHTML = 0 # Eg, nuke leading whitespace in html manner. 648 IQUOTES = 1 # Italicize quoted text. 649 SHOWBR = 0 # Add <br> onto every line 650 651 def __init__(self, maillist): 652 # can't init the database while other processes are writing to it! 653 # XXX TODO- implement native locking 654 # with mailman's LockFile module for HyperDatabase.HyperDatabase 655 # 656 dir = maillist.archive_dir() 657 db = HyperDatabase.HyperDatabase(dir, maillist) 658 self.__super_init(dir, reload=1, database=db) 659 660 self.maillist = maillist 661 self._lock_file = None 662 self.lang = maillist.preferred_language 663 self.charset = Utils.GetCharSet(maillist.preferred_language) 664 665 if hasattr(self.maillist,'archive_volume_frequency'): 666 if self.maillist.archive_volume_frequency == 0: 667 self.ARCHIVE_PERIOD='year' 668 elif self.maillist.archive_volume_frequency == 2: 669 self.ARCHIVE_PERIOD='quarter' 670 elif self.maillist.archive_volume_frequency == 3: 671 self.ARCHIVE_PERIOD='week' 672 elif self.maillist.archive_volume_frequency == 4: 673 self.ARCHIVE_PERIOD='day' 674 else: 675 self.ARCHIVE_PERIOD='month' 676 677 yre = r'(?P<year>[0-9]{4,4})' 678 mre = r'(?P<month>[01][0-9])' 679 dre = r'(?P<day>[0123][0-9])' 680 self._volre = { 681 'year': '^' + yre + '$', 682 'quarter': '^' + yre + r'q(?P<quarter>[1234])$', 683 'month': '^' + yre + r'-(?P<month>[a-zA-Z]+)$', 684 'week': r'^Week-of-Mon-' + yre + mre + dre, 685 'day': '^' + yre + mre + dre + '$' 686 } 687 688 def _makeArticle(self, msg, sequence): 689 return Article(msg, sequence, 690 lang=self.maillist.preferred_language, 691 mlist=self.maillist) 692 693 def html_foot(self): 694 # avoid i18n side-effects 695 mlist = self.maillist 696 otrans = i18n.get_translation() 697 i18n.set_language(mlist.preferred_language) 698 # Convenience 699 def quotetime(s): 700 return html_quote(i18n.ctime(s), self.lang) 701 try: 702 d = {"lastdate": quotetime(self.lastdate), 703 "archivedate": quotetime(self.archivedate), 704 "listinfo": mlist.GetScriptURL('listinfo', absolute=1), 705 "version": self.version, 706 "listname": html_quote(mlist.real_name, self.lang), 707 } 708 i = {"thread": _("thread"), 709 "subject": _("subject"), 710 "author": _("author"), 711 "date": _("date") 712 } 713 finally: 714 i18n.set_translation(otrans) 715 716 for t in i.keys(): 717 cap = t[0].upper() + t[1:] 718 if self.type == cap: 719 d["%s_ref" % (t)] = "" 720 else: 721 d["%s_ref" % (t)] = ('<a href="%s.html#start">[ %s ]</a>' 722 % (t, i[t])) 723 return quick_maketext( 724 'archidxfoot.html', d, 725 mlist=mlist) 726 727 def html_head(self): 728 # avoid i18n side-effects 729 mlist = self.maillist 730 otrans = i18n.get_translation() 731 i18n.set_language(mlist.preferred_language) 732 # Convenience 733 def quotetime(s): 734 return html_quote(i18n.ctime(s), self.lang) 735 try: 736 d = {"listname": html_quote(mlist.real_name, self.lang), 737 "archtype": self.type, 738 "archive": self.volNameToDesc(self.archive), 739 "listinfo": mlist.GetScriptURL('listinfo', absolute=1), 740 "firstdate": quotetime(self.firstdate), 741 "lastdate": quotetime(self.lastdate), 742 "size": self.size, 743 } 744 i = {"thread": _("thread"), 745 "subject": _("subject"), 746 "author": _("author"), 747 "date": _("date"), 748 } 749 finally: 750 i18n.set_translation(otrans) 751 752 for t in i.keys(): 753 cap = t[0].upper() + t[1:] 754 if self.type == cap: 755 d["%s_ref" % (t)] = "" 756 d["archtype"] = i[t] 757 else: 758 d["%s_ref" % (t)] = ('<a href="%s.html#start">[ %s ]</a>' 759 % (t, i[t])) 760 if self.charset: 761 d["encoding"] = html_charset % self.charset 762 else: 763 d["encoding"] = "" 764 return quick_maketext( 765 'archidxhead.html', d, 766 mlist=mlist) 767 768 def html_TOC(self): 769 mlist = self.maillist 770 listname = mlist.internal_name() 771 mbox = os.path.join(mlist.archive_dir()+'.mbox', listname+'.mbox') 772 d = {"listname": mlist.real_name, 773 "listinfo": mlist.GetScriptURL('listinfo', absolute=1), 774 "fullarch": '../%s.mbox/%s.mbox' % (listname, listname), 775 "size": sizeof(mbox, mlist.preferred_language), 776 'meta': '', 777 } 778 # Avoid i18n side-effects 779 otrans = i18n.get_translation() 780 i18n.set_language(mlist.preferred_language) 781 try: 782 if not self.archives: 783 d["noarchive_msg"] = _( 784 '<P>Currently, there are no archives. </P>') 785 d["archive_listing_start"] = "" 786 d["archive_listing_end"] = "" 787 d["archive_listing"] = "" 788 else: 789 d["noarchive_msg"] = "" 790 d["archive_listing_start"] = quick_maketext( 791 'archliststart.html', 792 lang=mlist.preferred_language, 793 mlist=mlist) 794 d["archive_listing_end"] = quick_maketext( 795 'archlistend.html', 796 mlist=mlist) 797 798 accum = [] 799 for a in self.archives: 800 accum.append(self.html_TOC_entry(a)) 801 d["archive_listing"] = EMPTYSTRING.join(accum) 802 finally: 803 i18n.set_translation(otrans) 804 # The TOC is always in the charset of the list's preferred language 805 d['meta'] += html_charset % Utils.GetCharSet(mlist.preferred_language) 806 # The site can disable public access to the mbox file. 807 if mm_cfg.PUBLIC_MBOX: 808 template = 'archtoc.html' 809 else: 810 template = 'archtocnombox.html' 811 return quick_maketext(template, d, mlist=mlist) 812 813 def html_TOC_entry(self, arch): 814 # Check to see if the archive is gzip'd or not 815 txtfile = os.path.join(self.maillist.archive_dir(), arch + '.txt') 816 gzfile = txtfile + '.gz' 817 # which exists? .txt.gz first, then .txt 818 if os.path.exists(gzfile): 819 file = gzfile 820 url = arch + '.txt.gz' 821 templ = '<td><A href="%(url)s">[ ' + _('Gzip\'d Text%(sz)s') \ 822 + ']</a></td>' 823 elif os.path.exists(txtfile): 824 file = txtfile 825 url = arch + '.txt' 826 templ = '<td><A href="%(url)s">[ ' + _('Text%(sz)s') + ']</a></td>' 827 else: 828 # neither found? 829 file = None 830 # in Python 1.5.2 we have an easy way to get the size 831 if file: 832 textlink = templ % { 833 'url': url, 834 'sz' : sizeof(file, self.maillist.preferred_language) 835 } 836 else: 837 # there's no archive file at all... hmmm. 838 textlink = '' 839 return quick_maketext( 840 'archtocentry.html', 841 {'archive': arch, 842 'archivelabel': self.volNameToDesc(arch), 843 'textlink': textlink 844 }, 845 mlist=self.maillist) 846 847 def GetArchLock(self): 848 if self._lock_file: 849 return 1 850 self._lock_file = LockFile.LockFile( 851 os.path.join(mm_cfg.LOCK_DIR, 852 self.maillist.internal_name() + '-arch.lock')) 853 try: 854 self._lock_file.lock(timeout=0.5) 855 except LockFile.TimeOutError: 856 return 0 857 return 1 858 859 def DropArchLock(self): 860 if self._lock_file: 861 self._lock_file.unlock(unconditionally=1) 862 self._lock_file = None 863 864 def processListArch(self): 865 name = self.maillist.ArchiveFileName() 866 wname= name+'.working' 867 ename= name+'.err_unarchived' 868 try: 869 os.stat(name) 870 except (IOError,os.error): 871 #no archive file, nothin to do -ddm 872 return 873 874 #see if arch is locked here -ddm 875 if not self.GetArchLock(): 876 #another archiver is running, nothing to do. -ddm 877 return 878 879 #if the working file is still here, the archiver may have 880 # crashed during archiving. Save it, log an error, and move on. 881 try: 882 wf = open(wname) 883 syslog('error', 884 'Archive working file %s present. ' 885 'Check %s for possibly unarchived msgs', 886 wname, ename) 887 omask = os.umask(007) 888 try: 889 ef = open(ename, 'a+') 890 finally: 891 os.umask(omask) 892 ef.seek(1,2) 893 if ef.read(1) <> '\n': 894 ef.write('\n') 895 ef.write(wf.read()) 896 ef.close() 897 wf.close() 898 os.unlink(wname) 899 except IOError: 900 pass 901 os.rename(name,wname) 902 archfile = open(wname) 903 self.processUnixMailbox(archfile) 904 archfile.close() 905 os.unlink(wname) 906 self.DropArchLock() 907 908 def get_filename(self, article): 909 return '%06i.html' % (article.sequence,) 910 911 def get_archives(self, article): 912 """Return a list of indexes where the article should be filed. 913 A string can be returned if the list only contains one entry, 914 and the empty list is legal.""" 915 res = self.dateToVolName(float(article.date)) 916 self.message(C_("figuring article archives\n")) 917 self.message(res + "\n") 918 return res 919 920 def volNameToDesc(self, volname): 921 volname = volname.strip() 922 # Don't make these module global constants since we have to runtime 923 # translate them anyway. 924 monthdict = [ 925 '', 926 _('January'), _('February'), _('March'), _('April'), 927 _('May'), _('June'), _('July'), _('August'), 928 _('September'), _('October'), _('November'), _('December') 929 ] 930 for each in self._volre.keys(): 931 match = re.match(self._volre[each], volname) 932 # Let ValueErrors percolate up 933 if match: 934 year = int(match.group('year')) 935 if each == 'quarter': 936 d =["", _("First"), _("Second"), _("Third"), _("Fourth") ] 937 ord = d[int(match.group('quarter'))] 938 return _("%(ord)s quarter %(year)i") 939 elif each == 'month': 940 monthstr = match.group('month').lower() 941 for i in range(1, 13): 942 monthname = time.strftime("%B", (1999,i,1,0,0,0,0,1,0)) 943 if monthstr.lower() == monthname.lower(): 944 month = monthdict[i] 945 return _("%(month)s %(year)i") 946 raise ValueError, "%s is not a month!" % monthstr 947 elif each == 'week': 948 month = monthdict[int(match.group("month"))] 949 day = int(match.group("day")) 950 return _("The Week Of Monday %(day)i %(month)s %(year)i") 951 elif each == 'day': 952 month = monthdict[int(match.group("month"))] 953 day = int(match.group("day")) 954 return _("%(day)i %(month)s %(year)i") 955 else: 956 return match.group('year') 957 raise ValueError, "%s is not a valid volname" % volname 958 959# The following two methods should be inverses of each other. -ddm 960 961 def dateToVolName(self,date): 962 datetuple=time.localtime(date) 963 if self.ARCHIVE_PERIOD=='year': 964 return time.strftime("%Y",datetuple) 965 elif self.ARCHIVE_PERIOD=='quarter': 966 if datetuple[1] in [1,2,3]: 967 return time.strftime("%Yq1",datetuple) 968 elif datetuple[1] in [4,5,6]: 969 return time.strftime("%Yq2",datetuple) 970 elif datetuple[1] in [7,8,9]: 971 return time.strftime("%Yq3",datetuple) 972 else: 973 return time.strftime("%Yq4",datetuple) 974 elif self.ARCHIVE_PERIOD == 'day': 975 return time.strftime("%Y%m%d", datetuple) 976 elif self.ARCHIVE_PERIOD == 'week': 977 # Reconstruct "seconds since epoch", and subtract weekday 978 # multiplied by the number of seconds in a day. 979 monday = time.mktime(datetuple) - datetuple[6] * 24 * 60 * 60 980 # Build a new datetuple from this "seconds since epoch" value 981 datetuple = time.localtime(monday) 982 return time.strftime("Week-of-Mon-%Y%m%d", datetuple) 983 # month. -ddm 984 else: 985 return time.strftime("%Y-%B",datetuple) 986 987 988 def volNameToDate(self, volname): 989 volname = volname.strip() 990 for each in self._volre.keys(): 991 match = re.match(self._volre[each],volname) 992 if match: 993 year = int(match.group('year')) 994 month = 1 995 day = 1 996 if each == 'quarter': 997 q = int(match.group('quarter')) 998 month = (q * 3) - 2 999 elif each == 'month': 1000 monthstr = match.group('month').lower() 1001 m = [] 1002 for i in range(1,13): 1003 m.append( 1004 time.strftime("%B",(1999,i,1,0,0,0,0,1,0)).lower()) 1005 try: 1006 month = m.index(monthstr) + 1 1007 except ValueError: 1008 pass 1009 elif each == 'week' or each == 'day': 1010 month = int(match.group("month")) 1011 day = int(match.group("day")) 1012 try: 1013 return time.mktime((year,month,1,0,0,0,0,1,-1)) 1014 except OverflowError: 1015 return 0.0 1016 return 0.0 1017 1018 def sortarchives(self): 1019 def sf(a, b): 1020 al = self.volNameToDate(a) 1021 bl = self.volNameToDate(b) 1022 if al > bl: 1023 return 1 1024 elif al < bl: 1025 return -1 1026 else: 1027 return 0 1028 if self.ARCHIVE_PERIOD in ('month','year','quarter'): 1029 self.archives.sort(sf) 1030 else: 1031 self.archives.sort() 1032 self.archives.reverse() 1033 1034 def message(self, msg): 1035 if self.VERBOSE: 1036 f = sys.stderr 1037 f.write(msg) 1038 if msg[-1:] != '\n': 1039 f.write('\n') 1040 f.flush() 1041 1042 def open_new_archive(self, archive, archivedir): 1043 index_html = os.path.join(archivedir, 'index.html') 1044 try: 1045 os.unlink(index_html) 1046 except: 1047 pass 1048 os.symlink(self.DEFAULTINDEX+'.html',index_html) 1049 1050 def write_index_header(self): 1051 self.depth=0 1052 print self.html_head() 1053 if not self.THREADLAZY and self.type=='Thread': 1054 self.message(C_("Computing threaded index\n")) 1055 self.updateThreadedIndex() 1056 1057 def write_index_footer(self): 1058 for i in range(self.depth): 1059 print '</UL>' 1060 print self.html_foot() 1061 1062 def write_index_entry(self, article): 1063 subject = self.get_header("subject", article) 1064 author = self.get_header("author", article) 1065 if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS: 1066 try: 1067 author = re.sub('@', _(' at '), author) 1068 except UnicodeError: 1069 # Non-ASCII author contains '@' ... no valid email anyway 1070 pass 1071 subject = CGIescape(subject, self.lang) 1072 author = CGIescape(author, self.lang) 1073 1074 d = { 1075 'filename': urllib.quote(article.filename), 1076 'subject': subject, 1077 'sequence': article.sequence, 1078 'author': author 1079 } 1080 print quick_maketext( 1081 'archidxentry.html', d, 1082 mlist=self.maillist) 1083 1084 def get_header(self, field, article): 1085 # if we have no decoded header, return the encoded one 1086 result = article.decoded.get(field) 1087 if result is None: 1088 return getattr(article, field) 1089 # otherwise, the decoded one will be Unicode 1090 return result 1091 1092 def write_threadindex_entry(self, article, depth): 1093 if depth < 0: 1094 self.message('depth<0') 1095 depth = 0 1096 if depth > self.THREADLEVELS: 1097 depth = self.THREADLEVELS 1098 if depth < self.depth: 1099 for i in range(self.depth-depth): 1100 print '</UL>' 1101 elif depth > self.depth: 1102 for i in range(depth-self.depth): 1103 print '<UL>' 1104 print '<!--%i %s -->' % (depth, article.threadKey) 1105 self.depth = depth 1106 self.write_index_entry(article) 1107 1108 def write_TOC(self): 1109 self.sortarchives() 1110 omask = os.umask(002) 1111 try: 1112 toc = open(os.path.join(self.basedir, 'index.html'), 'w') 1113 finally: 1114 os.umask(omask) 1115 toc.write(self.html_TOC()) 1116 toc.close() 1117 1118 def write_article(self, index, article, path): 1119 # called by add_article 1120 omask = os.umask(002) 1121 try: 1122 f = open(path, 'w') 1123 finally: 1124 os.umask(omask) 1125 f.write(article.as_html()) 1126 f.close() 1127 1128 # Write the text article to the text archive. 1129 path = os.path.join(self.basedir, "%s.txt" % index) 1130 omask = os.umask(002) 1131 try: 1132 f = open(path, 'a+') 1133 finally: 1134 os.umask(omask) 1135 f.write(article.as_text()) 1136 f.close() 1137 1138 def update_archive(self, archive): 1139 self.__super_update_archive(archive) 1140 # only do this if the gzip module was imported globally, and 1141 # gzip'ing was enabled via mm_cfg.GZIP_ARCHIVE_TXT_FILES. See 1142 # above. 1143 if gzip: 1144 archz = None 1145 archt = None 1146 txtfile = os.path.join(self.basedir, '%s.txt' % archive) 1147 gzipfile = os.path.join(self.basedir, '%s.txt.gz' % archive) 1148 oldgzip = os.path.join(self.basedir, '%s.old.txt.gz' % archive) 1149 try: 1150 # open the plain text file 1151 archt = open(txtfile) 1152 except IOError: 1153 return 1154 try: 1155 os.rename(gzipfile, oldgzip) 1156 archz = gzip.open(oldgzip) 1157 except (IOError, RuntimeError, os.error): 1158 pass 1159 try: 1160 ou = os.umask(002) 1161 newz = gzip.open(gzipfile, 'w') 1162 finally: 1163 # XXX why is this a finally? 1164 os.umask(ou) 1165 if archz: 1166 newz.write(archz.read()) 1167 archz.close() 1168 os.unlink(oldgzip) 1169 # XXX do we really need all this in a try/except? 1170 try: 1171 newz.write(archt.read()) 1172 newz.close() 1173 archt.close() 1174 except IOError: 1175 pass 1176 os.unlink(txtfile) 1177 1178 _skip_attrs = ('maillist', '_lock_file', 'charset') 1179 1180 def getstate(self): 1181 d={} 1182 for each in self.__dict__.keys(): 1183 if not (each in self._skip_attrs 1184 or each.upper() == each): 1185 d[each] = self.__dict__[each] 1186 return d 1187 1188 # Add <A HREF="..."> tags around URLs and e-mail addresses. 1189 1190 def __processbody_URLquote(self, lines): 1191 # XXX a lot to do here: 1192 # 1. use lines directly, rather than source and dest 1193 # 2. make it clearer 1194 # 3. make it faster 1195 # TK: Prepare for unicode obscure. 1196 atmark = _(' at ') 1197 if lines and isinstance(lines[0], types.UnicodeType): 1198 atmark = unicode(atmark, Utils.GetCharSet(self.lang), 'replace') 1199 source = lines[:] 1200 dest = lines 1201 last_line_was_quoted = 0 1202 for i in xrange(0, len(source)): 1203 Lorig = L = source[i] 1204 prefix = suffix = "" 1205 if L is None: 1206 continue 1207 # Italicise quoted text 1208 if self.IQUOTES: 1209 quoted = quotedpat.match(L) 1210 if quoted is None: 1211 last_line_was_quoted = 0 1212 else: 1213 quoted = quoted.end(0) 1214 prefix = CGIescape(L[:quoted], self.lang) + '<i>' 1215 suffix = '</I>' 1216 if self.SHOWHTML: 1217 suffix += '<BR>' 1218 if not last_line_was_quoted: 1219 prefix = '<BR>' + prefix 1220 L = L[quoted:] 1221 last_line_was_quoted = 1 1222 # Check for an e-mail address 1223 L2 = "" 1224 jr = emailpat.search(L) 1225 kr = urlpat.search(L) 1226 while jr is not None or kr is not None: 1227 if jr == None: 1228 j = -1 1229 else: 1230 j = jr.start(0) 1231 if kr is None: 1232 k = -1 1233 else: 1234 k = kr.start(0) 1235 if j != -1 and (j < k or k == -1): 1236 text = jr.group(1) 1237 length = len(text) 1238 if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS: 1239 text = re.sub('@', atmark, text) 1240 URL = self.maillist.GetScriptURL( 1241 'listinfo', absolute=1) 1242 else: 1243 URL = 'mailto:' + text 1244 pos = j 1245 elif k != -1 and (j > k or j == -1): 1246 text = URL = kr.group(1) 1247 length = len(text) 1248 pos = k 1249 else: # j==k 1250 raise ValueError, "j==k: This can't happen!" 1251 #length = len(text) 1252 #self.message("URL: %s %s %s \n" 1253 # % (CGIescape(L[:pos]), URL, CGIescape(text))) 1254 L2 += '%s<A HREF="%s">%s</A>' % ( 1255 CGIescape(L[:pos], self.lang), 1256 html_quote(URL), CGIescape(text, self.lang)) 1257 L = L[pos+length:] 1258 jr = emailpat.search(L) 1259 kr = urlpat.search(L) 1260 if jr is None and kr is None: 1261 L = CGIescape(L, self.lang) 1262 L = prefix + L2 + L + suffix 1263 source[i] = None 1264 dest[i] = L 1265 1266 # Perform Hypermail-style processing of <HTML></HTML> directives 1267 # in message bodies. Lines between <HTML> and </HTML> will be written 1268 # out precisely as they are; other lines will be passed to func2 1269 # for further processing . 1270 1271 def __processbody_HTML(self, lines): 1272 # XXX need to make this method modify in place 1273 source = lines[:] 1274 dest = lines 1275 l = len(source) 1276 i = 0 1277 while i < l: 1278 while i < l and htmlpat.match(source[i]) is None: 1279 i = i + 1 1280 if i < l: 1281 source[i] = None 1282 i = i + 1 1283 while i < l and nohtmlpat.match(source[i]) is None: 1284 dest[i], source[i] = source[i], None 1285 i = i + 1 1286 if i < l: 1287 source[i] = None 1288 i = i + 1 1289 1290 def format_article(self, article): 1291 # called from add_article 1292 # TBD: Why do the HTML formatting here and keep it in the 1293 # pipermail database? It makes more sense to do the html 1294 # formatting as the article is being written as html and toss 1295 # the data after it has been written to the archive file. 1296 lines = filter(None, article.body) 1297 # Handle <HTML> </HTML> directives 1298 if self.ALLOWHTML: 1299 self.__processbody_HTML(lines) 1300 self.__processbody_URLquote(lines) 1301 if not self.SHOWHTML and lines: 1302 lines.insert(0, '<PRE>') 1303 lines.append('</PRE>') 1304 else: 1305 # Do fancy formatting here 1306 if self.SHOWBR: 1307 lines = map(lambda x:x + "<BR>", lines) 1308 else: 1309 for i in range(0, len(lines)): 1310 s = lines[i] 1311 if s[0:1] in ' \t\n': 1312 lines[i] = '<P>' + s 1313 article.html_body = lines 1314 return article 1315 1316 def update_article(self, arcdir, article, prev, next): 1317 seq = article.sequence 1318 filename = os.path.join(arcdir, article.filename) 1319 self.message(C_('Updating HTML for article %(seq)s')) 1320 try: 1321 f = open(filename) 1322 article.loadbody_fromHTML(f) 1323 f.close() 1324 except IOError, e: 1325 if e.errno <> errno.ENOENT: raise 1326 self.message(C_('article file %(filename)s is missing!')) 1327 article.prev = prev 1328 article.next = next 1329 omask = os.umask(002) 1330 try: 1331 f = open(filename, 'w') 1332 finally: 1333 os.umask(omask) 1334 f.write(article.as_html()) 1335 f.close() 1336