1# Copyright (C) 1998-2018 by the Free Software Foundation, Inc.
2#
3# This program is free software; you can redistribute it and/or
4# modify it under the terms of the GNU General Public License
5# as published by the Free Software Foundation; either version 2
6# of the License, or (at your option) any later version.
7#
8# This program is distributed in the hope that it will be useful,
9# but WITHOUT ANY WARRANTY; without even the implied warranty of
10# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11# GNU General Public License for more details.
12#
13# You should have received a copy of the GNU General Public License
14# along with this program; if not, write to the Free Software
15# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
16# USA.
17
18"""Cook a message's Subject header.
19Also do other manipulations of From:, Reply-To: and Cc: depending on
20list configuration.
21"""
22
23from __future__ import nested_scopes
24import re
25from types import UnicodeType
26
27from email.Charset import Charset
28from email.Header import Header, decode_header, make_header
29from email.Utils import parseaddr, formataddr, getaddresses
30from email.Errors import HeaderParseError
31
32from Mailman import i18n
33from Mailman import mm_cfg
34from Mailman import Utils
35from Mailman.i18n import _
36from Mailman.Logging.Syslog import syslog
37
# Separator used in process() to fold an over-long, comma-separated header
# value onto continuation lines.
CONTINUATION = ',\n '
# Separator used when joining formatted addresses into one header value.
COMMASPACE = ', '
MAXLINELEN = 78

# True/False
# Fall back to integer stand-ins when the running interpreter does not
# provide the True and False names.
try:
    True, False
except NameError:
    True = 1
    False = 0
48
49
50
def _isunicode(s):
    """Return a true value when `s` is an instance of UnicodeType."""
    is_unicode = isinstance(s, UnicodeType)
    return is_unicode
53
# Matches any character that is neither whitespace nor in the printable
# ASCII range '!' through '~'.
nonascii = re.compile(r'[^\s!-~]')

def uheader(mlist, s, header_name=None, continuation_ws=' ', maxlinelen=None):
    """Return `s` as a Header instance in a charset suitable for the list.

    The charset is chosen as follows:

    - `s` has no non-ascii character: 'us-ascii', whatever the charset of
      the list's preferred language is;
    - `s` has non-ascii characters and the list's charset is 'us-ascii':
      'iso-8859-1';
    - otherwise: the charset of the list's preferred language.

    header_name, continuation_ws and maxlinelen are handed to Header
    unchanged.  If Header raises UnicodeError the problem is logged and a
    Header built from the empty string is returned instead.
    """
    list_charset = Utils.GetCharSet(mlist.preferred_language)
    if not nonascii.search(s):
        # there is no nonascii so ...
        charset = 'us-ascii'
    elif list_charset == 'us-ascii':
        # us-ascii can't carry the string; fall back to latin-1.
        charset = 'iso-8859-1'
    else:
        # use list charset
        charset = list_charset
    # Everything but the string itself is the same for both Header calls.
    header_args = (charset, maxlinelen, header_name, continuation_ws)
    try:
        return Header(s, *header_args)
    except UnicodeError:
        syslog('error', 'list: %s: can\'t decode "%s" as %s',
               mlist.internal_name(), s, charset)
        return Header('', *header_args)
75
def change_header(name, value, mlist, msg, msgdata, delete=True, repl=True):
    """Set header `name` to `value`, either on `msg` or deferred in msgdata.

    The change is stored in msgdata['add_header'][name] instead of being
    applied to `msg` when either

    - the effective from_is_list setting is 2 and the message is not
      fast tracked (msgdata['_fasttrack'] is false or missing), or
    - `name` is From, Reply-To or Cc (compared case-insensitively).

    The effective from_is_list setting is msgdata['from_is_list'], or
    mlist.from_is_list when the msgdata value equals 0.

    Otherwise `msg` is modified directly:

    repl   -- when false, an already present header is left alone.
    delete -- when true, existing `name` headers are deleted before the
              new value is set.
    """
    mode = msgdata.get('from_is_list')
    if mode == 0:
        # No per-message setting; the list's own setting decides.
        mode = mlist.from_is_list
    defer = mode == 2 and not msgdata.get('_fasttrack')
    # From:, Reply-To: and Cc: are always deferred: when we are munging
    # the From:, we want to defer the resultant changes to these headers
    # until after the message passes through ToDigest, ToArchive and
    # ToUsenet.  Thus, we put them in msgdata[add_header] here and apply
    # them in WrapMessage.
    if defer or name.lower() in ('from', 'reply-to', 'cc'):
        msgdata.setdefault('add_header', {})[name] = value
        return
    if not repl and msg.has_key(name):
        # Asked not to replace, and the header is already there.
        return
    if delete:
        del msg[name]
    msg[name] = value
91
92
93
def process(mlist, msg, msgdata):
    """Cook the headers of `msg` for delivery through the list `mlist`.

    In this order the function:

    - adds "X-Ack: no" when msgdata has the 'noack' flag;
    - saves the sender in msgdata['original_sender'] unless already there;
    - prefixes the Subject: (see prefix_subject()) unless the message is a
      digest or is fast tracked;
    - adds X-BeenThere:, X-Mailman-Version: and Precedence:;
    - when From: munging is enabled (msgdata['from_is_list'] or
      mlist.from_is_list) and the message is not fast tracked, replaces
      From: with "<poster> via <list>" at the list posting address;
    - unless fast tracked, rebuilds Reply-To: and, where needed, Cc:;
    - unless msgdata has '_nolist' or the list has RFC 2369 headers turned
      off, adds List-Id: and the other List-* headers.

    Headers are set through change_header(), which may store them in
    msgdata['add_header'] instead of changing `msg`.

    mlist   -- the mailing list the message is posted to.
    msg     -- the message; modified in place.
    msgdata -- the message metadata dictionary; read and updated.

    Returns None.
    """
    # Set the "X-Ack: no" header if noack flag is set.
    if msgdata.get('noack'):
        change_header('X-Ack', 'no', mlist, msg, msgdata)
    # Because we're going to modify various important headers in the email
    # message, we want to save some of the information in the msgdata
    # dictionary for later.  Specifically, the sender header will get waxed,
    # but we need it for the Acknowledge module later.
    # We may have already saved it; if so, don't clobber it here.
    if 'original_sender' not in msgdata:
        msgdata['original_sender'] = msg.get_sender()
    # VirginRunner sets _fasttrack for internally crafted messages.
    fasttrack = msgdata.get('_fasttrack')
    if not msgdata.get('isdigest') and not fasttrack:
        try:
            prefix_subject(mlist, msg, msgdata)
        except (UnicodeError, ValueError):
            # TK: Sometimes subject header is not MIME encoded for 8bit
            # simply abort prefixing.
            pass
    # Mark message so we know we've been here, but leave any existing
    # X-BeenThere's intact.
    change_header('X-BeenThere', mlist.GetListEmail(),
                  mlist, msg, msgdata, delete=False)
    # Add Precedence: and other useful headers.  None of these are standard
    # and finding information on some of them are fairly difficult.  Some are
    # just common practice, and we'll add more here as they become necessary.
    # Good places to look are:
    #
    # http://www.dsv.su.se/~jpalme/ietf/jp-ietf-home.html
    # http://www.faqs.org/rfcs/rfc2076.html
    #
    # None of these headers are added if they already exist.  BAW: some
    # consider the advertising of this a security breach.  I.e. if there are
    # known exploits in a particular version of Mailman and we know a site is
    # using such an old version, they may be vulnerable.  It's too easy to
    # edit the code to add a configuration variable to handle this.
    change_header('X-Mailman-Version', mm_cfg.VERSION,
                  mlist, msg, msgdata, repl=False)
    # We set "Precedence: list" because this is the recommendation from the
    # sendmail docs, the most authoritative source of this header's semantics.
    change_header('Precedence', 'list',
                  mlist, msg, msgdata, repl=False)
    # Do we change the from so the list takes ownership of the email
    if (msgdata.get('from_is_list') or mlist.from_is_list) and not fasttrack:
        # Be as robust as possible here.
        faddrs = getaddresses(msg.get_all('from', []))
        # Strip the nulls and bad emails.
        faddrs = [x for x in faddrs if x[1].find('@') > 0]
        if len(faddrs) == 1:
            realname, email = o_from = faddrs[0]
        else:
            # No From: or multiple addresses.  Just punt and take
            # the get_sender result.
            realname = ''
            email = msgdata['original_sender']
            o_from = (realname, email)
        if not realname:
            if mlist.isMember(email):
                realname = mlist.getMemberName(email) or email
            else:
                realname = email
        # Remove domain from realname if it looks like an email address
        realname = re.sub(r'@([^ .]+\.)+[^ .]+$', '---', realname)
        # Make a display name and RFC 2047 encode it if necessary.  This is
        # difficult and kludgy. If the realname came from From: it should be
        # ascii or RFC 2047 encoded. If it came from the list, it should be
        # in the charset of the list's preferred language or possibly unicode.
        # if it's from the email address, it should be ascii. In any case,
        # make it a unicode.
        if isinstance(realname, unicode):
            urn = realname
        else:
            rn, cs = ch_oneline(realname)
            urn = unicode(rn, cs, errors='replace')
        # likewise, the list's real_name which should be ascii, but use the
        # charset of the list's preferred_language which should be a superset.
        lcs = Utils.GetCharSet(mlist.preferred_language)
        ulrn = unicode(mlist.real_name, lcs, errors='replace')
        # get translated 'via' with dummy replacements
        # NOTE(review): realname and lrn look unused below, but they are the
        # "dummy replacements" for the _() call -- presumably _() fills the
        # %(...)s fields from this function's local names.  Don't remove
        # them or move the _() call into another function without checking.
        realname = '%(realname)s'
        lrn = '%(lrn)s'
        # We want the i18n context to be the list's preferred_language.  It
        # could be the poster's.
        otrans = i18n.get_translation()
        i18n.set_language(mlist.preferred_language)
        try:
            via = _('%(realname)s via %(lrn)s')
        finally:
            # Put the previous translation back even if _() blows up.
            i18n.set_translation(otrans)
        uvia = unicode(via, lcs, errors='replace')
        # Replace the dummy replacements.  This is done in one pass with a
        # replacement function because a replacement *string* is treated as
        # a template by re.sub: a backslash in the poster's or list's name
        # would be processed as an escape or group reference.  One pass also
        # keeps a name that itself contains '%(lrn)s' from being substituted
        # a second time.
        names = {u'realname': urn, u'lrn': ulrn}
        uvia = re.sub(u'%\\((realname|lrn)\\)s',
                      lambda mo: names[mo.group(1)], uvia)
        # And get an RFC 2047 encoded header string.
        dn = str(Header(uvia, lcs))
        change_header('From',
                      formataddr((dn, mlist.GetListEmail())),
                      mlist, msg, msgdata)
    else:
        # Use this as a flag
        o_from = None
    # Reply-To: munging.  Do not do this if the message is "fast tracked",
    # meaning it is internally crafted and delivered to a specific user.  BAW:
    # Yuck, I really hate this feature but I've caved under the sheer pressure
    # of the (very vocal) folks want it.  OTOH, RFC 2822 allows Reply-To: to
    # be a list of addresses, so instead of replacing the original, simply
    # augment it.  RFC 2822 allows max one Reply-To: header so collapse them
    # if we're adding a value, otherwise don't touch it.  (Should we collapse
    # in all cases?)
    # MAS: We need to do some things with the original From: if we've munged
    # it for DMARC mitigation.  We have goals for this process which are
    # not completely compatible, so we do the best we can.  Our goals are:
    # 1) as long as the list is not anonymous, the original From: address
    #    should be obviously exposed, i.e. not just in a header that MUAs
    #    don't display.
    # 2) the original From: address should not be in a comment or display
    #    name in the new From: because it is claimed that multiple domains
    #    in any fields in From: are indicative of spamminess.  This means
    #    it should be in Reply-To: or Cc:.
    # 3) the behavior of an MUA doing a 'reply' or 'reply all' should be
    #    consistent regardless of whether or not the From: is munged.
    # Goal 3) implies sometimes the original From: should be in Reply-To:
    # and sometimes in Cc:, and even so, this goal won't be achieved in
    # all cases with all MUAs.  In cases of conflict, the above ordering of
    # goals is priority order.

    if not fasttrack:
        # A convenience function, requires nested scopes.  pair is (name, addr)
        # It appends pair to `new` unless its address (compared in lower
        # case) was added before.  `new` and `d` are looked up when add()
        # is called, so rebinding them below starts a fresh address list.
        new = []
        d = {}
        def add(pair):
            lcaddr = pair[1].lower()
            if lcaddr in d:
                return
            d[lcaddr] = pair
            new.append(pair)
        # List admin wants an explicit Reply-To: added
        if mlist.reply_goes_to_list == 2:
            add(parseaddr(mlist.reply_to_address))
        # If we're not first stripping existing Reply-To: then we need to add
        # the original Reply-To:'s to the list we're building up.  In both
        # cases we'll zap the existing field because RFC 2822 says max one is
        # allowed.
        o_rt = False
        if not mlist.first_strip_reply_to:
            orig = msg.get_all('reply-to', [])
            for pair in getaddresses(orig):
                # There's an original Reply-To: and we're not removing it.
                add(pair)
                o_rt = True
        # We also need to put the old From: in Reply-To: in all cases where
        # it is not going in Cc:.  This is when reply_goes_to_list == 0 and
        # either there was no original Reply-To: or we stripped it.
        # However, if there was an original Reply-To:, unstripped, and it
        # contained the original From: address we need to flag that it's
        # there so we don't add the original From: to Cc:
        if o_from and mlist.reply_goes_to_list == 0:
            if o_rt:
                if o_from[1].lower() in d:
                    # Original From: address is in original Reply-To:.
                    # Pretend we added it.
                    o_from = None
            else:
                add(o_from)
                # Flag that we added it.
                o_from = None
        # Set Reply-To: header to point back to this list.  Add this last
        # because some folks think that some MUAs make it easier to delete
        # addresses from the right than from the left.
        if mlist.reply_goes_to_list == 1:
            i18ndesc = uheader(mlist, mlist.description, 'Reply-To')
            add((str(i18ndesc), mlist.GetListEmail()))
        # Don't put Reply-To: back if there's nothing to add!
        if new:
            # Preserve order
            change_header('Reply-To',
                          COMMASPACE.join([formataddr(pair) for pair in new]),
                          mlist, msg, msgdata)
        else:
            del msg['reply-to']
        # The To field normally contains the list posting address.  However
        # when messages are fully personalized, that header will get
        # overwritten with the address of the recipient.  We need to get the
        # posting address in one of the recipient headers or they won't be
        # able to reply back to the list.  It's possible the posting address
        # was munged into the Reply-To header, but if not, we'll add it to a
        # Cc header.  BAW: should we force it into a Reply-To header in the
        # above code?
        # Also skip Cc if this is an anonymous list as list posting address
        # is already in From and Reply-To in this case.
        # We do add the Cc in cases where From: header munging is being done
        # because even though the list address is in From:, the Reply-To:
        # poster will override it. Brain dead MUAs may then address the list
        # twice on a 'reply all', but reasonable MUAs should do the right
        # thing.  We also add the original From: to Cc: if it wasn't added
        # to Reply-To:
        add_list = (mlist.personalize == 2 and
                    mlist.reply_goes_to_list != 1 and
                    not mlist.anonymous_list)
        if add_list or o_from:
            # Watch out for existing Cc headers, merge, and remove dups.  Note
            # that RFC 2822 says only zero or one Cc header is allowed.
            new = []
            d = {}
            # If we're adding the original From:, add it first.
            if o_from:
                add(o_from)
            # AvoidDuplicates may have set a new Cc: in msgdata.add_header,
            # so check that.
            if 'add_header' in msgdata and 'Cc' in msgdata['add_header']:
                for pair in getaddresses([msgdata['add_header']['Cc']]):
                    add(pair)
            else:
                for pair in getaddresses(msg.get_all('cc', [])):
                    add(pair)
            if add_list:
                i18ndesc = uheader(mlist, mlist.description, 'Cc')
                add((str(i18ndesc), mlist.GetListEmail()))
            change_header('Cc',
                          COMMASPACE.join([formataddr(pair) for pair in new]),
                          mlist, msg, msgdata)
    # Add list-specific headers as defined in RFC 2369 and RFC 2919, but only
    # if the message is being crafted for a specific list (e.g. not for the
    # password reminders).
    #
    # BAW: Some people really hate the List-* headers.  It seems that the free
    # version of Eudora (possibly on for some platforms) does not hide these
    # headers by default, pissing off their users.  Too bad.  Fix the MUAs.
    if msgdata.get('_nolist') or not mlist.include_rfc2369_headers:
        return
    # This will act like an email address for purposes of formataddr()
    listid = '%s.%s' % (mlist.internal_name(), mlist.host_name)
    # without desc we need to ensure the MUST brackets
    listid_h = '<%s>' % listid
    if mlist.description:
        # Don't wrap the header since here we just want to get it properly RFC
        # 2047 encoded.
        i18ndesc = uheader(mlist, mlist.description, 'List-Id', maxlinelen=998)
        # With some charsets (utf-8?) and some invalid chars, str(i18ndesc)
        # can be empty.  In that case keep the bare bracketed list id.
        desc = str(i18ndesc)
        if desc:
            listid_h = formataddr((desc, listid))
    # We always add a List-ID: header.
    change_header('List-Id', listid_h, mlist, msg, msgdata)
    # For internally crafted messages, we also add a (nonstandard),
    # "X-List-Administrivia: yes" header.  For all others (i.e. those coming
    # from list posts), we add a bunch of other RFC 2369 headers.
    requestaddr = mlist.GetRequestEmail()
    subfieldfmt = '<%s>, <mailto:%s?subject=%ssubscribe>'
    listinfo = mlist.GetScriptURL('listinfo', absolute=1)
    useropts = mlist.GetScriptURL('options', absolute=1)
    headers = {}
    if msgdata.get('reduced_list_headers'):
        headers['X-List-Administrivia'] = 'yes'
    else:
        headers.update({
            'List-Help'       : '<mailto:%s?subject=help>' % requestaddr,
            'List-Unsubscribe': subfieldfmt % (useropts, requestaddr, 'un'),
            'List-Subscribe'  : subfieldfmt % (listinfo, requestaddr, ''),
            })
        # List-Post: is controlled by a separate attribute
        if mlist.include_list_post_header:
            headers['List-Post'] = '<mailto:%s>' % mlist.GetListEmail()
        # Add this header if we're archiving
        if mlist.archive:
            archiveurl = mlist.GetBaseArchiveURL()
            headers['List-Archive'] = '<%s>' % archiveurl
    # First we delete any pre-existing headers because the RFC permits only
    # one copy of each, and we want to be sure it's ours.
    for h, v in headers.items():
        # Wrap these lines if they are too long.  MAXLINELEN (78) is at
        # least text-MUA friendly.  The adding of 2 is for the colon-space
        # separator.
        if len(h) + 2 + len(v) > MAXLINELEN:
            v = CONTINUATION.join(v.split(', '))
        change_header(h, v, mlist, msg, msgdata)
374
375
376
def prefix_subject(mlist, msg, msgdata):
    """Put the list's subject prefix on the Subject: header of `msg`.

    Nothing happens when mlist.subject_prefix is empty or only whitespace.
    Otherwise:

    - the original Subject: value is saved in msgdata['origsubj'];
    - existing copies of the prefix and a leading run of reply markers
      (Re:, AW:, SV:, VS:, optionally numbered) are removed from the
      subject; an empty result becomes the translated '(no subject)';
    - '%d'-style fields in the prefix are filled in with mlist.post_id;
    - the new Subject: is set through change_header() and the subject
      without the prefix is stored in msgdata['stripped_subject'].

    process() calls this only for messages that are neither digests nor
    fast tracked, and ignores UnicodeError and ValueError raised here.
    """
    prefix = mlist.subject_prefix.strip()
    if not prefix:
        return
    subject = msg.get('subject', '')
    # Try to figure out what the continuation_ws is for the header: it is
    # the first character of the second physical line, if that is a space
    # or a tab.
    if isinstance(subject, Header):
        raw_lines = str(subject).splitlines()
    else:
        raw_lines = subject.splitlines()
    fold_ws = ' '
    if len(raw_lines) > 1:
        second_line = raw_lines[1]
        if second_line and second_line[0] in ' \t':
            fold_ws = second_line[0]
    msgdata['origsubj'] = subject
    # The subject may be multilingual but we take the first charset as major
    # one and try to decode.  If it is decodable, returned subject is in one
    # line and cset is properly set.  If fail, subject is mime-encoded and
    # cset is set as us-ascii.  See detail for ch_oneline() (CookHeaders one
    # line function).
    subject, cset = ch_oneline(subject)
    # TK: Python interpreter has evolved to be strict on ascii charset code
    # range.  It is safe to use unicode string when manipulating header
    # contents with re module.  It would be best to return unicode in
    # ch_oneline() but here is temporary solution.
    subject = unicode(subject, cset)
    # If the subject_prefix contains '%d', it is replaced with the
    # mailing list sequential number.  Sequential number format allows
    # '%d' or '%05d' like pattern.
    seqnum_re = re.compile(r'%\d*d')
    # Escape the prefix for use as a pattern, but unescape '%' :-<
    prefix_pattern = re.escape(prefix).replace(r'\%', '%')
    if seqnum_re.search(prefix, 1):
        # prefix have number, so we should search prefix w/number in subject.
        # Also, force new style.
        prefix_pattern = seqnum_re.sub(r'\s*\d+\s*', prefix_pattern)
        old_style = False
    else:
        old_style = mm_cfg.OLD_STYLE_PREFIXING
    subject = re.sub(prefix_pattern, '', subject)
    # Previously the following re didn't have the first \s*. It would fail
    # if the incoming Subject: was like '[prefix] Re: Re: Re:' because of the
    # leading space after stripping the prefix. It is not known what MUA would
    # create such a Subject:, but the issue was reported.
    reply_run = re.match(r'(\s*(RE|AW|SV|VS)\s*(\[\d+\])?\s*:\s*)+',
                         subject, re.I)
    recolon = ''
    if reply_run:
        # Collapse the whole run of reply markers into a single 'Re:'.
        subject = subject[reply_run.end():]
        recolon = 'Re:'
    # Strip leading and trailing whitespace from subject.
    subject = subject.strip()
    # At this point, subject may become null if someone post mail with
    # Subject: [subject prefix]
    if not subject:
        # We want the i18n context to be the list's preferred_language.  It
        # could be the poster's.
        saved_translation = i18n.get_translation()
        i18n.set_language(mlist.preferred_language)
        subject = _('(no subject)')
        i18n.set_translation(saved_translation)
        cset = Utils.GetCharSet(mlist.preferred_language)
        subject = unicode(subject, cset)
    # and substitute %d in prefix with post_id
    try:
        prefix = prefix % mlist.post_id
    except TypeError:
        # The prefix has no field to fill in; use it as it is.
        pass
    # If charset is 'us-ascii', try to concatenate as string because there
    # is some weirdness in Header module (TK)
    if cset == 'us-ascii':
        try:
            if old_style:
                pieces = [recolon, prefix, subject]
            elif recolon:
                pieces = [prefix, recolon, subject]
            else:
                pieces = [prefix, subject]
            cooked = u' '.join(pieces).encode('us-ascii')
            cooked = uheader(mlist, cooked, 'Subject',
                             continuation_ws=fold_ws)
            change_header('Subject', cooked, mlist, msg, msgdata)
            stripped = u' '.join([recolon, subject]).encode('us-ascii')
            msgdata['stripped_subject'] = uheader(
                mlist, stripped, 'Subject', continuation_ws=fold_ws)
            return
        except UnicodeError:
            # Not plain ascii after all; build the header piece by piece.
            pass
    # Get the header as a Header instance, with proper unicode conversion
    # Because of rfc2047 encoding, spaces between encoded words can be
    # insignificant, so we need to append spaces to our encoded stuff.
    prefix += ' '
    if recolon:
        recolon += ' '
    # Old style puts the 'Re:' in front of the prefix, new style after it.
    if old_style:
        lead, follow = recolon, prefix
    else:
        lead, follow = prefix, recolon
    cooked = uheader(mlist, lead, 'Subject', continuation_ws=fold_ws)
    cooked.append(follow)
    # TK: Subject is concatenated and unicode string.
    subject = subject.encode(cset, 'replace')
    cooked.append(subject, cset)
    change_header('Subject', cooked, mlist, msg, msgdata)
    stripped = uheader(mlist, recolon, 'Subject', continuation_ws=fold_ws)
    stripped.append(subject, cset)
    msgdata['stripped_subject'] = stripped
490
491
492
def ch_oneline(headerstr):
    """Decode a header value onto one line, in a single charset.

    Copied and modified from ToDigest.py and Utils.py.

    Returns a (string, cset) tuple.  On success the string is the decoded
    header with its line breaks removed, encoded in cset, where cset is
    the first charset named in the header or 'us-ascii' if none is named.
    When decoding fails with LookupError, UnicodeError, ValueError or
    HeaderParseError, the string is the undecoded header with only its
    line breaks removed and cset is 'us-ascii'; callers can use that as a
    check for failure.
    """
    try:
        # at this point, we should rstrip() every string because some
        # MUA deliberately add trailing spaces when composing return
        # message.
        chunks = [(text.rstrip(), charset)
                  for (text, charset) in decode_header(headerstr)]
        # The first chunk that names a charset decides the result charset.
        cset = 'us-ascii'
        for text, charset in chunks:
            if charset:
                cset = charset
                break
        decoded = make_header(chunks).__unicode__()
        flattened = u''.join(decoded.splitlines())
        return flattened.encode(cset, 'replace'), cset
    except (LookupError, UnicodeError, ValueError, HeaderParseError):
        # possibly charset problem. return with undecoded string in one line.
        return ''.join(headerstr.splitlines()), 'us-ascii'
516