1"""Header value parser implementing various email-related RFC parsing rules.
2
3The parsing methods defined in this module implement various email related
parsing rules.  Principal among them is RFC 5322, which is the follow-on
5to RFC 2822 and primarily a clarification of the former.  It also implements
6RFC 2047 encoded word decoding.
7
8RFC 5322 goes to considerable trouble to maintain backward compatibility with
9RFC 822 in the parse phase, while cleaning up the structure on the generation
10phase.  This parser supports correct RFC 5322 generation by tagging white space
11as folding white space only when folding is allowed in the non-obsolete rule
12sets.  Actually, the parser is even more generous when accepting input than RFC
135322 mandates, following the spirit of Postel's Law, which RFC 5322 encourages.
14Where possible deviations from the standard are annotated on the 'defects'
15attribute of tokens that deviate.
16
17The general structure of the parser follows RFC 5322, and uses its terminology
18where there is a direct correspondence.  Where the implementation requires a
19somewhat different structure than that used by the formal grammar, new terms
20that mimic the closest existing terms are used.  Thus, it really helps to have
21a copy of RFC 5322 handy when studying this code.
22
23Input to the parser is a string that has already been unfolded according to
24RFC 5322 rules.  According to the RFC this unfolding is the very first step, and
25this parser leaves the unfolding step to a higher level message parser, which
26will have already detected the line breaks that need unfolding while
27determining the beginning and end of each header.
28
29The output of the parser is a TokenList object, which is a list subclass.  A
30TokenList is a recursive data structure.  The terminal nodes of the structure
31are Terminal objects, which are subclasses of str.  These do not correspond
32directly to terminal objects in the formal grammar, but are instead more
33practical higher level combinations of true terminals.
34
35All TokenList and Terminal objects have a 'value' attribute, which produces the
36semantically meaningful value of that part of the parse subtree.  The value of
37all whitespace tokens (no matter how many sub-tokens they may contain) is a
38single space, as per the RFC rules.  This includes 'CFWS', which is herein
39included in the general class of whitespace tokens.  There is one exception to
40the rule that whitespace tokens are collapsed into single spaces in values: in
41the value of a 'bare-quoted-string' (a quoted-string with no leading or
42trailing whitespace), any whitespace that appeared between the quotation marks
43is preserved in the returned value.  Note that in all Terminal strings quoted
44pairs are turned into their unquoted values.
45
46All TokenList and Terminal objects also have a string value, which attempts to
47be a "canonical" representation of the RFC-compliant form of the substring that
48produced the parsed subtree, including minimal use of quoted pair quoting.
49Whitespace runs are not collapsed.
50
51Comment tokens also have a 'content' attribute providing the string found
52between the parens (including any nested comments) with whitespace preserved.
53
54All TokenList and Terminal objects have a 'defects' attribute which is a
possibly empty list of all the defects found while creating the token.  Defects
56may appear on any token in the tree, and a composite list of all defects in the
57subtree is available through the 'all_defects' attribute of any node.  (For
Terminal nodes x.defects == x.all_defects.)
59
60Each object in a parse tree is called a 'token', and each has a 'token_type'
61attribute that gives the name from the RFC 5322 grammar that it represents.
62Not all RFC 5322 nodes are produced, and there is one non-RFC 5322 node that
63may be produced: 'ptext'.  A 'ptext' is a string of printable ascii characters.
64It is returned in place of lists of (ctext/quoted-pair) and
65(qtext/quoted-pair).
66
67XXX: provide complete list of token types.
68"""
69
import re
import sys
import urllib   # For urllib.parse.unquote
import urllib.parse   # Bind the submodule explicitly; 'import urllib' alone
                      # does not guarantee urllib.parse is available.
from string import hexdigits
from operator import itemgetter
from email import _encoded_words as _ew
from email import errors
from email import utils
78
79#
80# Useful constants and functions
81#
82
# Whitespace characters allowed within a header.
WSP = set(' \t')
# Characters that can begin CFWS: whitespace or the start of a comment.
CFWS_LEADER = WSP | set('(')
# RFC 5322 'specials': delimiters in structured header values.
SPECIALS = set(r'()<>@,:;.\"[]')
# Characters that terminate an atom.
ATOM_ENDS = SPECIALS | WSP
# A '.' does not terminate a dot-atom.
DOT_ATOM_ENDS = ATOM_ENDS - set('.')
# '.', '"', and '(' do not end phrases in order to support obs-phrase
PHRASE_ENDS = SPECIALS - set('."(')
# MIME token delimiters: the specials plus '/', '?', '=' but without '.'.
TSPECIALS = (SPECIALS | set('/?=')) - set('.')
# Characters that terminate a MIME token.
TOKEN_ENDS = TSPECIALS | WSP
# RFC 2231 treats '*', "'", and '%' as additionally significant.
ASPECIALS = TSPECIALS | set("*'%")
# Characters that terminate an RFC 2231 attribute name.
ATTRIBUTE_ENDS = ASPECIALS | WSP
# '%' is allowed inside extended attribute values (percent-encoding).
EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')
95
def quote_string(value):
    """Return *value* as an RFC 5322 quoted-string.

    The string form of *value* has backslashes and double quotes escaped
    as quoted-pairs, and the result is wrapped in double quotes.
    """
    escaped = str(value).replace('\\', '\\\\').replace('"', '\\"')
    return '"{}"'.format(escaped)
98
# Match a RFC 2047 word, looks like =?utf-8?q?someword?=
# (used to detect encoded words embedded in otherwise plain text).
rfc2047_matcher = re.compile(r'''
   =\?            # literal =?
   [^?]*          # charset
   \?             # literal ?
   [qQbB]         # literal 'q' or 'b', case insensitive
   \?             # literal ?
  .*?             # encoded word
  \?=             # literal ?=
''', re.VERBOSE | re.MULTILINE)
109
110
111#
112# TokenList and its subclasses
113#
114
class TokenList(list):
    """Base class for all parse tree nodes.

    A TokenList is a list of sub-tokens (TokenLists or Terminals).
    str() reconstructs the canonical source form of the parsed text,
    while the 'value' property yields the semantic value (see the
    module docstring for the distinction).
    """

    token_type = None
    # True if folding may insert a line break at this token.
    syntactic_break = True
    # True if this token's encoded words may be merged with adjacent ones.
    ew_combine_allowed = True

    def __init__(self, *args, **kw):
        super().__init__(*args, **kw)
        # Defects local to this node; see all_defects for the subtree.
        self.defects = []

    def __str__(self):
        return ''.join(str(x) for x in self)

    def __repr__(self):
        return '{}({})'.format(self.__class__.__name__,
                             super().__repr__())

    @property
    def value(self):
        # Concatenation of the children's (non-empty) semantic values.
        return ''.join(x.value for x in self if x.value)

    @property
    def all_defects(self):
        # This node's defects plus those of every descendant.
        return sum((x.all_defects for x in self), self.defects)

    def startswith_fws(self):
        # Delegate to the first child token.
        return self[0].startswith_fws()

    @property
    def as_ew_allowed(self):
        """True if all top level tokens of this part may be RFC2047 encoded."""
        return all(part.as_ew_allowed for part in self)

    @property
    def comments(self):
        # All comment strings found anywhere in the subtree.
        comments = []
        for token in self:
            comments.extend(token.comments)
        return comments

    def fold(self, *, policy):
        # Folding is handled by the module-level algorithm (defined later
        # in this module).
        return _refold_parse_tree(self, policy=policy)

    def pprint(self, indent=''):
        """Print an indented rendering of this parse subtree."""
        print(self.ppstr(indent=indent))

    def ppstr(self, indent=''):
        """Return an indented rendering of this parse subtree."""
        return '\n'.join(self._pp(indent=indent))

    def _pp(self, indent=''):
        # Recursively generate the pretty-print lines for this subtree.
        yield '{}{}/{}('.format(
            indent,
            self.__class__.__name__,
            self.token_type)
        for token in self:
            if not hasattr(token, '_pp'):
                yield (indent + '    !! invalid element in token '
                                        'list: {!r}'.format(token))
            else:
                yield from token._pp(indent+'    ')
        if self.defects:
            extra = ' Defects: {}'.format(self.defects)
        else:
            extra = ''
        yield '{}){}'.format(indent, extra)
180
181
class WhiteSpaceTokenList(TokenList):
    """A token list whose semantic value is a single space.

    Per the RFC rules, any run of whitespace (including CFWS) collapses
    to one space in the semantic value.
    """

    @property
    def value(self):
        return ' '

    @property
    def comments(self):
        found = []
        for token in self:
            if token.token_type == 'comment':
                found.append(token.content)
        return found
191
192
class UnstructuredTokenList(TokenList):
    """Parse tree for an unstructured header value."""
    token_type = 'unstructured'
195
196
class Phrase(TokenList):
    """An RFC 5322 phrase (a sequence of words)."""
    token_type = 'phrase'
199
class Word(TokenList):
    """An RFC 5322 word (an atom or a quoted-string)."""
    token_type = 'word'
202
203
class CFWSList(WhiteSpaceTokenList):
    """A run of CFWS: comments and/or folding white space."""
    token_type = 'cfws'
206
207
class Atom(TokenList):
    """An RFC 5322 atom."""
    token_type = 'atom'
210
211
class Token(TokenList):
    """A MIME 'token' (RFC 2045)."""
    token_type = 'token'
    # Tokens are never re-encoded as RFC 2047 encoded words.
    encode_as_ew = False
215
216
class EncodedWord(TokenList):
    """An RFC 2047 encoded word (=?charset?cte?encoded-text?=)."""
    token_type = 'encoded-word'
    cte = None      # set by get_encoded_word
    charset = None  # set by get_encoded_word
    lang = None     # RFC 2231 language tag, when present
222
223
class QuotedString(TokenList):
    """An RFC 5322 quoted-string, possibly with surrounding CFWS."""

    token_type = 'quoted-string'

    @property
    def content(self):
        """The semantic value of the embedded bare-quoted-string, if any."""
        for token in self:
            if token.token_type == 'bare-quoted-string':
                return token.value

    @property
    def quoted_value(self):
        """The value with the bare-quoted-string left in quoted form."""
        pieces = []
        for token in self:
            if token.token_type == 'bare-quoted-string':
                pieces.append(str(token))
            else:
                pieces.append(token.value)
        return ''.join(pieces)

    @property
    def stripped_value(self):
        """Same as 'content': the bare-quoted-string's semantic value."""
        for token in self:
            if token.token_type == 'bare-quoted-string':
                return token.value
249
250
class BareQuotedString(QuotedString):
    """A quoted-string with no leading or trailing CFWS."""

    token_type = 'bare-quoted-string'

    def __str__(self):
        # Re-quote the content, escaping backslashes and double quotes.
        return quote_string(''.join(str(token) for token in self))

    @property
    def value(self):
        # Whitespace between the quotes is preserved in the value.
        return ''.join(str(token) for token in self)
261
262
class Comment(WhiteSpaceTokenList):
    """An RFC 5322 comment (parenthesized text inside CFWS)."""

    token_type = 'comment'

    def __str__(self):
        pieces = ['(']
        pieces.extend(self.quote(token) for token in self)
        pieces.append(')')
        return ''.join(pieces)

    def quote(self, value):
        """Return str(value) with backslash, '(' and ')' escaped."""
        if value.token_type == 'comment':
            # Nested comments handle their own escaping.
            return str(value)
        text = str(value).replace('\\', '\\\\')
        text = text.replace('(', '\\(')
        return text.replace(')', '\\)')

    @property
    def content(self):
        # The text between the parens, whitespace preserved.
        return ''.join(str(token) for token in self)

    @property
    def comments(self):
        return [self.content]
288
class AddressList(TokenList):
    """A comma-separated list of addresses."""

    token_type = 'address-list'

    @property
    def addresses(self):
        return [token for token in self if token.token_type == 'address']

    @property
    def mailboxes(self):
        result = []
        for token in self:
            if token.token_type == 'address':
                result.extend(token.mailboxes)
        return result

    @property
    def all_mailboxes(self):
        # Like mailboxes, but invalid mailboxes are included too.
        result = []
        for token in self:
            if token.token_type == 'address':
                result.extend(token.all_mailboxes)
        return result
306
307
class Address(TokenList):
    """A single RFC 5322 address: a mailbox or a group."""

    token_type = 'address'

    @property
    def display_name(self):
        # Only a group carries its display name at this level.
        if self[0].token_type == 'group':
            return self[0].display_name

    @property
    def mailboxes(self):
        first = self[0]
        if first.token_type == 'mailbox':
            return [first]
        if first.token_type == 'invalid-mailbox':
            return []
        return first.mailboxes

    @property
    def all_mailboxes(self):
        first = self[0]
        if first.token_type == 'mailbox':
            return [first]
        if first.token_type == 'invalid-mailbox':
            # Invalid mailboxes are reported here, unlike in mailboxes.
            return [first]
        return first.all_mailboxes
332
class MailboxList(TokenList):
    """A comma-separated list of mailboxes."""

    token_type = 'mailbox-list'

    @property
    def mailboxes(self):
        return [token for token in self if token.token_type == 'mailbox']

    @property
    def all_mailboxes(self):
        # Invalid mailboxes are included as well as valid ones.
        wanted = ('mailbox', 'invalid-mailbox')
        return [token for token in self if token.token_type in wanted]
345
346
class GroupList(TokenList):
    """The (possibly absent) mailbox list inside a group."""

    token_type = 'group-list'

    @property
    def mailboxes(self):
        if self and self[0].token_type == 'mailbox-list':
            return self[0].mailboxes
        return []

    @property
    def all_mailboxes(self):
        if self and self[0].token_type == 'mailbox-list':
            return self[0].all_mailboxes
        return []
362
363
class Group(TokenList):
    """An RFC 5322 group: display-name ":" [group-list] ";"."""

    token_type = "group"

    @property
    def mailboxes(self):
        # The group-list, when present, is the third token.
        if self[2].token_type == 'group-list':
            return self[2].mailboxes
        return []

    @property
    def all_mailboxes(self):
        if self[2].token_type == 'group-list':
            return self[2].all_mailboxes
        return []

    @property
    def display_name(self):
        return self[0].display_name
383
384
class NameAddr(TokenList):
    """An RFC 5322 name-addr: [display-name] angle-addr.

    The angle-addr is always the last child; address attributes are
    delegated to it.
    """

    token_type = 'name-addr'

    @property
    def display_name(self):
        if len(self) == 1:
            # Only the angle-addr is present; no display name was given.
            return None
        return self[0].display_name

    @property
    def local_part(self):
        return self[-1].local_part

    @property
    def domain(self):
        return self[-1].domain

    @property
    def route(self):
        return self[-1].route

    @property
    def addr_spec(self):
        return self[-1].addr_spec
410
411
class AngleAddr(TokenList):
    """An addr-spec enclosed in angle brackets, e.g. <foo@example.com>."""

    token_type = 'angle-addr'

    @property
    def local_part(self):
        for x in self:
            if x.token_type == 'addr-spec':
                return x.local_part

    @property
    def domain(self):
        for x in self:
            if x.token_type == 'addr-spec':
                return x.domain

    @property
    def route(self):
        # Routes exist only in the obsolete syntax (obs-route).
        for x in self:
            if x.token_type == 'obs-route':
                return x.domains

    @property
    def addr_spec(self):
        for x in self:
            if x.token_type == 'addr-spec':
                if x.local_part:
                    return x.addr_spec
                else:
                    # Empty local part: quote it so the result is still a
                    # syntactically valid addr-spec.
                    return quote_string(x.local_part) + x.addr_spec
        else:
            # No addr-spec at all: a "null" angle-addr.
            return '<>'
444
445
class ObsRoute(TokenList):
    """An obsolete route (a list of @domain items)."""

    token_type = 'obs-route'

    @property
    def domains(self):
        result = []
        for token in self:
            if token.token_type == 'domain':
                result.append(token.domain)
        return result
453
454
class Mailbox(TokenList):
    """An RFC 5322 mailbox: either a name-addr or a bare addr-spec."""

    token_type = 'mailbox'

    @property
    def display_name(self):
        # Only a name-addr can carry a display name.
        if self[0].token_type == 'name-addr':
            return self[0].display_name

    @property
    def local_part(self):
        return self[0].local_part

    @property
    def domain(self):
        return self[0].domain

    @property
    def route(self):
        # Routes exist only in the (obsolete) name-addr syntax.
        if self[0].token_type == 'name-addr':
            return self[0].route

    @property
    def addr_spec(self):
        return self[0].addr_spec
480
481
class InvalidMailbox(TokenList):
    """A mailbox that could not be parsed; all attributes are None."""

    token_type = 'invalid-mailbox'

    @property
    def display_name(self):
        return None

    # All address attributes share the same always-None property object.
    local_part = domain = route = addr_spec = display_name
491
492
class Domain(TokenList):
    """An RFC 5322 domain."""

    token_type = 'domain'
    # Domains may not be RFC 2047 encoded.
    as_ew_allowed = False

    @property
    def domain(self):
        # Collapse out any whitespace (e.g. from obsolete syntax).
        return ''.join(super().value.split())
501
502
class DotAtom(TokenList):
    """An RFC 5322 dot-atom."""
    token_type = 'dot-atom'
505
506
class DotAtomText(TokenList):
    """The dot-separated atom text of a dot-atom (no CFWS)."""
    token_type = 'dot-atom-text'
    # Replaces the base class's computed property with a constant.
    as_ew_allowed = True
510
511
class NoFoldLiteral(TokenList):
    """An RFC 5322 no-fold-literal (may not be folded or encoded)."""
    token_type = 'no-fold-literal'
    as_ew_allowed = False
515
516
class AddrSpec(TokenList):
    """An addr-spec: local-part "@" domain (domain may be missing)."""

    token_type = 'addr-spec'
    as_ew_allowed = False

    @property
    def local_part(self):
        return self[0].local_part

    @property
    def domain(self):
        if len(self) < 3:
            # No '@domain' portion was parsed.
            return None
        return self[-1].domain

    @property
    def value(self):
        if len(self) < 3:
            return self[0].value
        # Drop whitespace adjacent to the '@'.
        return (self[0].value.rstrip() + self[1].value +
                self[2].value.lstrip())

    @property
    def addr_spec(self):
        lp = self.local_part
        if set(lp) & DOT_ATOM_ENDS:
            # Characters not allowed in a dot-atom force quoting.
            lp = quote_string(lp)
        if self.domain is None:
            return lp
        return lp + '@' + self.domain
548
549
class ObsLocalPart(TokenList):
    """A local-part in the obsolete syntax."""

    token_type = 'obs-local-part'
    as_ew_allowed = False
554
555
class DisplayName(Phrase):
    """The display-name of an address.

    Fix: the original indexed self[0][0]/self[-1][-1] unconditionally when
    the edge token was not cfws.  If that edge token is a Terminal (a str
    subclass), indexing it yields a plain one-character str with no
    token_type attribute, raising AttributeError.  Guard those accesses
    with isinstance(..., TokenList).
    """

    token_type = 'display-name'
    ew_combine_allowed = False

    @property
    def display_name(self):
        """Semantic value with leading/trailing CFWS removed."""
        res = TokenList(self)
        if len(res) == 0:
            return res.value
        if res[0].token_type == 'cfws':
            res.pop(0)
        else:
            # Only composite tokens can hold an embedded leading cfws;
            # Terminals (str subclasses) must not be indexed here.
            if (isinstance(res[0], TokenList) and
                    res[0][0].token_type == 'cfws'):
                res[0] = TokenList(res[0][1:])
        if res[-1].token_type == 'cfws':
            res.pop()
        else:
            if (isinstance(res[-1], TokenList) and
                    res[-1][-1].token_type == 'cfws'):
                res[-1] = TokenList(res[-1][:-1])
        return res.value

    @property
    def value(self):
        """Canonical value, quoting the phrase when required.

        Quoting is forced when the phrase has defects or contains a
        quoted-string; surrounding CFWS collapses to single spaces.
        """
        quote = False
        if self.defects:
            quote = True
        else:
            for x in self:
                if x.token_type == 'quoted-string':
                    quote = True
        if len(self) != 0 and quote:
            pre = post = ''
            # Same Terminal-safety guard as in display_name above.
            if (self[0].token_type == 'cfws' or
                    (isinstance(self[0], TokenList) and
                     self[0][0].token_type == 'cfws')):
                pre = ' '
            if (self[-1].token_type == 'cfws' or
                    (isinstance(self[-1], TokenList) and
                     self[-1][-1].token_type == 'cfws')):
                post = ' '
            return pre + quote_string(self.display_name) + post
        else:
            return super().value
596
597
class LocalPart(TokenList):
    """The local-part of an addr-spec (the part before the '@')."""

    token_type = 'local-part'
    as_ew_allowed = False

    @property
    def value(self):
        if self[0].token_type == "quoted-string":
            return self[0].quoted_value
        else:
            return self[0].value

    @property
    def local_part(self):
        # Strip whitespace from front, back, and around dots.
        #
        # The child tokens are walked with a DOT sentinel at both ends;
        # standalone cfws tokens are dropped, and cfws just inside a dot
        # boundary within composite tokens is trimmed.
        res = [DOT]
        last = DOT
        last_is_tl = False
        for tok in self[0] + [DOT]:
            if tok.token_type == 'cfws':
                continue
            if (last_is_tl and tok.token_type == 'dot' and
                    last[-1].token_type == 'cfws'):
                # Previous composite token ended with cfws before a dot.
                res[-1] = TokenList(last[:-1])
            is_tl = isinstance(tok, TokenList)
            if (is_tl and last.token_type == 'dot' and
                    tok[0].token_type == 'cfws'):
                # Composite token starts with cfws right after a dot.
                res.append(TokenList(tok[1:]))
            else:
                res.append(tok)
            last = res[-1]
            last_is_tl = is_tl
        # Drop the sentinels before computing the value.
        res = TokenList(res[1:-1])
        return res.value
632
633
class DomainLiteral(TokenList):
    """An RFC 5322 domain-literal, e.g. [127.0.0.1]."""

    token_type = 'domain-literal'
    as_ew_allowed = False

    @property
    def domain(self):
        # Collapse out all whitespace.
        return ''.join(super().value.split())

    @property
    def ip(self):
        # The ptext token carries the literal's text content.
        for x in self:
            if x.token_type == 'ptext':
                return x.value
648
649
class MIMEVersion(TokenList):
    """A MIME-Version header value."""

    token_type = 'mime-version'
    # Filled in with the parsed numeric values when available.
    major = None
    minor = None
655
656
class Parameter(TokenList):
    """A single MIME parameter (attribute[*section][*]=value)."""

    token_type = 'parameter'
    sectioned = False
    extended = False
    charset = 'us-ascii'

    @property
    def section_number(self):
        # Because the first token, the attribute (name) eats CFWS, the second
        # token is always the section if there is one.
        if self.sectioned:
            return self[1].number
        return 0

    @property
    def param_value(self):
        # This is part of the "handle quoted extended parameters" hack.
        for top in self:
            if top.token_type == 'value':
                return top.stripped_value
            if top.token_type == 'quoted-string':
                for middle in top:
                    if middle.token_type == 'bare-quoted-string':
                        for inner in middle:
                            if inner.token_type == 'value':
                                return inner.stripped_value
        return ''
683
684
class InvalidParameter(Parameter):
    """A parameter that could not be parsed."""

    token_type = 'invalid-parameter'
688
689
class Attribute(TokenList):
    """A MIME parameter name."""

    token_type = 'attribute'

    @property
    def stripped_value(self):
        # The value of the first *attrtext token, if any.
        match = next((token for token in self
                      if token.token_type.endswith('attrtext')), None)
        if match is not None:
            return match.value
699
class Section(TokenList):
    """An RFC 2231 parameter section marker (*N)."""

    token_type = 'section'
    # The numeric section index; None until successfully parsed.
    number = None
704
705
class Value(TokenList):
    """A MIME parameter value."""

    token_type = 'value'

    @property
    def stripped_value(self):
        # Skip leading CFWS, then unwrap quoted/attribute tokens.
        token = self[1] if self[0].token_type == 'cfws' else self[0]
        if token.token_type.endswith(
                ('quoted-string', 'attribute', 'extended-attribute')):
            return token.stripped_value
        return self.value
719
720
class MimeParameters(TokenList):
    """The parameter section of a parameterized MIME header value.

    Handles plain RFC 2045 parameters as well as RFC 2231 extended
    (sectioned and/or percent-encoded) parameters, including error
    recovery for duplicates and numbering gaps.
    """

    token_type = 'mime-parameters'
    # Folding must not treat the start of the parameter list as a break
    # point (see ParameterizedHeaderValue).
    syntactic_break = False

    @property
    def params(self):
        # The RFC specifically states that the ordering of parameters is not
        # guaranteed and may be reordered by the transport layer.  So we have
        # to assume the RFC 2231 pieces can come in any order.  However, we
        # output them in the order that we first see a given name, which gives
        # us a stable __str__.
        params = {}  # Using order preserving dict from Python 3.7+
        for token in self:
            if not token.token_type.endswith('parameter'):
                continue
            if token[0].token_type != 'attribute':
                continue
            name = token[0].value.strip()
            if name not in params:
                params[name] = []
            params[name].append((token.section_number, token))
        for name, parts in params.items():
            # Reassemble each parameter's sections in numeric order.
            parts = sorted(parts, key=itemgetter(0))
            first_param = parts[0][1]
            charset = first_param.charset
            # Our arbitrary error recovery is to ignore duplicate parameters,
            # to use appearance order if there are duplicate rfc 2231 parts,
            # and to ignore gaps.  This mimics the error recovery of get_param.
            if not first_param.extended and len(parts) > 1:
                if parts[1][0] == 0:
                    parts[1][1].defects.append(errors.InvalidHeaderDefect(
                        'duplicate parameter name; duplicate(s) ignored'))
                    parts = parts[:1]
                # Else assume the *0* was missing...note that this is different
                # from get_param, but we registered a defect for this earlier.
            value_parts = []
            i = 0
            for section_number, param in parts:
                if section_number != i:
                    # We could get fancier here and look for a complete
                    # duplicate extended parameter and ignore the second one
                    # seen.  But we're not doing that.  The old code didn't.
                    if not param.extended:
                        param.defects.append(errors.InvalidHeaderDefect(
                            'duplicate parameter name; duplicate ignored'))
                        continue
                    else:
                        param.defects.append(errors.InvalidHeaderDefect(
                            "inconsistent RFC2231 parameter numbering"))
                i += 1
                value = param.param_value
                if param.extended:
                    try:
                        value = urllib.parse.unquote_to_bytes(value)
                    except UnicodeEncodeError:
                        # source had surrogate escaped bytes.  What we do now
                        # is a bit of an open question.  I'm not sure this is
                        # the best choice, but it is what the old algorithm did
                        value = urllib.parse.unquote(value, encoding='latin-1')
                    else:
                        try:
                            value = value.decode(charset, 'surrogateescape')
                        except LookupError:
                            # XXX: there should really be a custom defect for
                            # unknown character set to make it easy to find,
                            # because otherwise unknown charset is a silent
                            # failure.
                            value = value.decode('us-ascii', 'surrogateescape')
                        if utils._has_surrogates(value):
                            param.defects.append(errors.UndecodableBytesDefect())
                value_parts.append(value)
            value = ''.join(value_parts)
            yield name, value

    def __str__(self):
        params = []
        for name, value in self.params:
            if value:
                params.append('{}={}'.format(name, quote_string(value)))
            else:
                params.append(name)
        params = '; '.join(params)
        return ' ' + params if params else ''
805
806
class ParameterizedHeaderValue(TokenList):
    """Base class for header values that can carry MIME parameters."""

    # Set this false so that the value doesn't wind up on a new line even
    # if it and the parameters would fit there but not on the first line.
    syntactic_break = False

    @property
    def params(self):
        # Parameters, when present, come after the value, so scan from
        # the end.
        for token in reversed(self):
            if token.token_type == 'mime-parameters':
                return token.params
        return {}
819
820
class ContentType(ParameterizedHeaderValue):
    """A Content-Type header value."""
    token_type = 'content-type'
    as_ew_allowed = False
    # Defaults correspond to RFC 2045's default type of text/plain.
    maintype = 'text'
    subtype = 'plain'
826
827
class ContentDisposition(ParameterizedHeaderValue):
    """A Content-Disposition header value."""
    token_type = 'content-disposition'
    as_ew_allowed = False
    # The disposition keyword (e.g. 'inline'); None until parsed.
    content_disposition = None
832
833
class ContentTransferEncoding(TokenList):
    """A Content-Transfer-Encoding header value."""
    token_type = 'content-transfer-encoding'
    as_ew_allowed = False
    # RFC 2045's default encoding.
    cte = '7bit'
838
839
class HeaderLabel(TokenList):
    """The 'Name:' portion of a header line."""
    token_type = 'header-label'
    as_ew_allowed = False
843
844
class MsgID(TokenList):
    """An RFC 5322 msg-id."""
    token_type = 'msg-id'
    as_ew_allowed = False

    def fold(self, policy):
        # message-id tokens may not be folded.
        return str(self) + policy.linesep
852
853
class MessageID(MsgID):
    """A Message-ID header value."""
    token_type = 'message-id'
856
857
class InvalidMessageID(MessageID):
    """A Message-ID header value that could not be fully parsed."""
    token_type = 'invalid-message-id'
860
861
class Header(TokenList):
    """A complete header line."""
    token_type = 'header'
864
865
866#
867# Terminal classes and instances
868#
869
class Terminal(str):
    """Base class for the leaf nodes of the parse tree; a str subclass."""

    as_ew_allowed = True
    ew_combine_allowed = True
    syntactic_break = True

    def __new__(cls, value, token_type):
        self = super().__new__(cls, value)
        self.token_type = token_type
        self.defects = []
        return self

    def __repr__(self):
        return '{}({})'.format(type(self).__name__, super().__repr__())

    def pprint(self):
        print(type(self).__name__ + '/' + self.token_type)

    @property
    def all_defects(self):
        # Terminals have no children, so all defects are local.
        return list(self.defects)

    def _pp(self, indent=''):
        meta = '' if not self.defects else ' {}'.format(self.defects)
        return ['{}{}/{}({}){}'.format(indent,
                                       type(self).__name__,
                                       self.token_type,
                                       super().__repr__(),
                                       meta)]

    def pop_trailing_ws(self):
        # This terminates the recursion.
        return None

    @property
    def comments(self):
        return []

    def __getnewargs__(self):
        # Support pickling/copying despite the extra __new__ argument.
        return (str(self), self.token_type)
911
912
class WhiteSpaceTerminal(Terminal):
    """A terminal consisting of whitespace; its semantic value is ' '."""

    @property
    def value(self):
        return ' '

    def startswith_fws(self):
        return True
921
922
class ValueTerminal(Terminal):
    """A terminal carrying semantically significant text."""

    @property
    def value(self):
        # The terminal is its own value.
        return self

    def startswith_fws(self):
        return False
931
932
class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
    """Whitespace adjacent to encoded words; semantically invisible."""

    @property
    def value(self):
        return ''

    def __str__(self):
        return ''
941
942
class _InvalidEwError(errors.HeaderParseError):
    """Invalid encoded word found while parsing headers."""
    # Subclasses HeaderParseError so existing handlers still catch it.
945
946
# Shared singleton terminals used throughout the parser.
# XXX these need to become classes and used as instances so
# that a program can't change them in a parse tree and screw
# up other parse trees.  Maybe should have tests for that, too.
DOT = ValueTerminal('.', 'dot')
ListSeparator = ValueTerminal(',', 'list-separator')
RouteComponentMarker = ValueTerminal('@', 'route-component-marker')
953
954#
955# Parser
956#
957
958# Parse strings according to RFC822/2047/2822/5322 rules.
959#
960# This is a stateless parser.  Each get_XXX function accepts a string and
961# returns either a Terminal or a TokenList representing the RFC object named
962# by the method and a string containing the remaining unparsed characters
963# from the input.  Thus a parser method consumes the next syntactic construct
964# of a given type and returns a token representing the construct plus the
965# unparsed remainder of the input string.
966#
967# For example, if the first element of a structured header is a 'phrase',
968# then:
969#
970#     phrase, value = get_phrase(value)
971#
972# returns the complete phrase from the start of the string value, plus any
973# characters left in the string after the phrase is removed.
974
# Split on runs of whitespace, keeping the whitespace runs in the result.
_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split
# Match a maximal run of characters that do not end an atom.
_non_atom_end_matcher = re.compile(r"[^{}]+".format(
    re.escape(''.join(ATOM_ENDS)))).match
# Find ASCII control characters, space, and DEL.
_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall
# Match a maximal run of characters that do not end a MIME token.
_non_token_end_matcher = re.compile(r"[^{}]+".format(
    re.escape(''.join(TOKEN_ENDS)))).match
# Match a maximal run of characters that do not end an attribute.
_non_attribute_end_matcher = re.compile(r"[^{}]+".format(
    re.escape(''.join(ATTRIBUTE_ENDS)))).match
# Match a maximal run of characters that do not end an extended attribute.
_non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format(
    re.escape(''.join(EXTENDED_ATTRIBUTE_ENDS)))).match
985
def _validate_xtext(xtext):
    """If input token contains ASCII non-printables, register a defect.

    Mutates xtext.defects in place.  Surrogate-escaped bytes (i.e.
    non-ASCII source data) are flagged as undecodable.
    """

    non_printables = _non_printable_finder(xtext)
    if non_printables:
        xtext.defects.append(errors.NonPrintableDefect(non_printables))
    if utils._has_surrogates(xtext):
        xtext.defects.append(errors.UndecodableBytesDefect(
            "Non-ASCII characters found in header token"))
995
def _get_ptext_to_endchars(value, endchars):
    """Scan printables/quoted-pairs until endchars and return unquoted ptext.

    This function turns a run of qcontent, ccontent-without-comments, or
    dtext-with-quoted-printables into a single string by unquoting any
    quoted printables.  It returns the string, the remaining value, and
    a flag that is True iff there were any quoted printables decoded.

    """
    # NOTE(review): assumes the first fragment is non-empty (callers
    # guarantee this); an empty fragment would leave 'pos' unbound below.
    fragment, *remainder = _wsp_splitter(value, 1)
    vchars = []
    escape = False
    had_qp = False
    for pos in range(len(fragment)):
        if fragment[pos] == '\\':
            if escape:
                # An escaped backslash: emit it and record the quoted pair.
                escape = False
                had_qp = True
            else:
                # Start of a quoted pair; the next character is literal.
                escape = True
                continue
        if escape:
            # NOTE(review): had_qp is not set for quoted pairs other than
            # an escaped backslash; this preserves historical behavior.
            escape = False
        elif fragment[pos] in endchars:
            break
        vchars.append(fragment[pos])
    else:
        # Loop completed without hitting an endchar: consume the whole
        # fragment by pointing pos past its last character.
        pos = pos + 1
    return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp
1025
def get_fws(value):
    """FWS = 1*WSP

    Not the strict RFC production: unfolding has already happened by the
    time we parse, so FWS here is simply a run of leading whitespace,
    tagged as a place where folding is permitted.

    """
    stripped = value.lstrip()
    ws_len = len(value) - len(stripped)
    return WhiteSpaceTerminal(value[:ws_len], 'fws'), stripped
1037
def get_encoded_word(value):
    """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?="

    Parse one RFC 2047 encoded word from the front of value; return the
    EncodedWord token and the remaining value.  Raises HeaderParseError
    if value does not start with a plausible encoded word, and
    _InvalidEwError if it looks like one but cannot be decoded.
    """
    ew = EncodedWord()
    if not value.startswith('=?'):
        raise errors.HeaderParseError(
            "expected encoded word but found {}".format(value))
    tok, *remainder = value[2:].split('?=', 1)
    if tok == value[2:]:
        # No '?=' terminator anywhere in the string.
        raise errors.HeaderParseError(
            "expected encoded word but found {}".format(value))
    remstr = ''.join(remainder)
    if (len(remstr) > 1 and
        remstr[0] in hexdigits and
        remstr[1] in hexdigits and
        tok.count('?') < 2):
        # The ? after the CTE was followed by an encoded word escape (=XX).
        rest, *remainder = remstr.split('?=', 1)
        tok = tok + '?=' + rest
    if len(tok.split()) > 1:
        ew.defects.append(errors.InvalidHeaderDefect(
            "whitespace inside encoded word"))
    # NOTE(review): this stores the full remaining input, not just the
    # encoded word itself; it is only used in the error message below.
    ew.cte = value
    value = ''.join(remainder)
    try:
        text, charset, lang, defects = _ew.decode('=?' + tok + '?=')
    except (ValueError, KeyError):
        # _InvalidEwError lets callers distinguish "looked like an encoded
        # word but wasn't" from ordinary parse failures.
        raise _InvalidEwError(
            "encoded word format invalid: '{}'".format(ew.cte))
    ew.charset = charset
    ew.lang = lang
    ew.defects.extend(defects)
    while text:
        # Split the decoded text into fws/vtext tokens, the same shape
        # unstructured text is parsed into.
        if text[0] in WSP:
            token, text = get_fws(text)
            ew.append(token)
            continue
        chars, *remainder = _wsp_splitter(text, 1)
        vtext = ValueTerminal(chars, 'vtext')
        _validate_xtext(vtext)
        ew.append(vtext)
        text = ''.join(remainder)
    # Encoded words should be followed by a WS
    if value and value[0] not in WSP:
        ew.defects.append(errors.InvalidHeaderDefect(
            "missing trailing whitespace after encoded-word"))
    return ew, value
1086
def get_unstructured(value):
    """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct
       obs-unstruct = *((*LF *CR *(obs-utext) *LF *CR)) / FWS)
       obs-utext = %d0 / obs-NO-WS-CTL / LF / CR

       obs-NO-WS-CTL is control characters except WSP/CR/LF.

    So, basically, we have printable runs, plus control characters or nulls in
    the obsolete syntax, separated by whitespace.  Since RFC 2047 uses the
    obsolete syntax in its specification, but requires whitespace on either
    side of the encoded words, I can see no reason to need to separate the
    non-printable-non-whitespace from the printable runs if they occur, so we
    parse this into xtext tokens separated by WSP tokens.

    Because an 'unstructured' value must by definition constitute the entire
    value, this 'get' routine does not return a remaining value, only the
    parsed TokenList.

    """
    # XXX: but what about bare CR and LF?  They might signal the start or
    # end of an encoded word.  YAGNI for now, since our current parsers
    # will never send us strings with bare CR or LF.

    unstructured = UnstructuredTokenList()
    while value:
        if value[0] in WSP:
            token, value = get_fws(value)
            unstructured.append(token)
            continue
        valid_ew = True
        if value.startswith('=?'):
            try:
                token, value = get_encoded_word(value)
            except _InvalidEwError:
                # Looked like an encoded word but could not be decoded;
                # fall through and treat it as ordinary text.
                valid_ew = False
            except errors.HeaderParseError:
                # XXX: Need to figure out how to register defects when
                # appropriate here.
                pass
            else:
                # RFC 2047 requires whitespace on both sides of an encoded
                # word; register a defect if the preceding token is not fws.
                have_ws = True
                if len(unstructured) > 0:
                    if unstructured[-1].token_type != 'fws':
                        unstructured.defects.append(errors.InvalidHeaderDefect(
                            "missing whitespace before encoded word"))
                        have_ws = False
                if have_ws and len(unstructured) > 1:
                    if unstructured[-2].token_type == 'encoded-word':
                        # Whitespace between two adjacent encoded words is
                        # not semantically significant; mark it so it can
                        # be dropped when the value is refolded.
                        unstructured[-1] = EWWhiteSpaceTerminal(
                            unstructured[-1], 'fws')
                unstructured.append(token)
                continue
        tok, *remainder = _wsp_splitter(value, 1)
        # Split in the middle of an atom if there is a rfc2047 encoded word
        # which does not have WSP on both sides. The defect will be registered
        # the next time through the loop.
        # This needs to only be performed when the encoded word is valid;
        # otherwise, performing it on an invalid encoded word can cause
        # the parser to go in an infinite loop.
        if valid_ew and rfc2047_matcher.search(tok):
            tok, *remainder = value.partition('=?')
        vtext = ValueTerminal(tok, 'vtext')
        _validate_xtext(vtext)
        unstructured.append(vtext)
        value = ''.join(remainder)
    return unstructured
1153
def get_qp_ctext(value):
    r"""ctext = <printable ascii except \ ( )>

    Looser than the RFC ctext: nested comments are handled by get_comment,
    and quoted-pairs are unquoted here, so everything except '(' and ')'
    is consumed.  Non-printable ASCII is accepted but recorded as a
    NonPrintableDefect on the returned token.  Because quoted pairs are
    decoded, the result is a 'ptext' token; it is a WhiteSpaceTerminal,
    so its .value is ' '.

    """
    text, value, _ = _get_ptext_to_endchars(value, '()')
    token = WhiteSpaceTerminal(text, 'ptext')
    _validate_xtext(token)
    return token, value
1170
def get_qcontent(value):
    """qcontent = qtext / quoted-pair

    Everything up to an unquoted DQUOTE is consumed; non-printable ASCII
    is accepted but recorded as a NonPrintableDefect on the returned
    token.  Quoted pairs are decoded, so the result is a 'ptext' token,
    here a ValueTerminal.

    """
    text, value, _ = _get_ptext_to_endchars(value, '"')
    token = ValueTerminal(text, 'ptext')
    _validate_xtext(token)
    return token, value
1185
def get_atext(value):
    """atext = <matches _atext_matcher>

    Consume the longest leading run of non-ATOM_ENDS characters as an
    'atext' terminal.  Characters that are not strictly RFC atext are
    still consumed, but _validate_xtext records a defect for any
    non-printables among them.
    """
    match = _non_atom_end_matcher(value)
    if match is None:
        raise errors.HeaderParseError(
            "expected atext but found '{}'".format(value))
    run = match.group()
    token = ValueTerminal(run, 'atext')
    _validate_xtext(token)
    return token, value[len(run):]
1201
def get_bare_quoted_string(value):
    """bare-quoted-string = DQUOTE *([FWS] qcontent) [FWS] DQUOTE

    A quoted-string without the leading or trailing white space.  Its
    value is the text between the quote marks, with whitespace
    preserved and quoted pairs decoded.
    """
    if value[0] != '"':
        raise errors.HeaderParseError(
            "expected '\"' but found '{}'".format(value))
    bare_quoted_string = BareQuotedString()
    value = value[1:]
    # An immediately-following DQUOTE means an empty quoted string; append
    # the (empty) qcontent token so the quoted string still has a value.
    if value and value[0] == '"':
        token, value = get_qcontent(value)
        bare_quoted_string.append(token)
    while value and value[0] != '"':
        if value[0] in WSP:
            token, value = get_fws(value)
        elif value[:2] == '=?':
            # An encoded word inside a quoted string is not legal, but we
            # decode it anyway and register the defect.
            valid_ew = False
            try:
                token, value = get_encoded_word(value)
                bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
                    "encoded word inside quoted string"))
                valid_ew = True
            except errors.HeaderParseError:
                token, value = get_qcontent(value)
            # Collapse the whitespace between two encoded words that occur in a
            # bare-quoted-string.
            if valid_ew and len(bare_quoted_string) > 1:
                if (bare_quoted_string[-1].token_type == 'fws' and
                        bare_quoted_string[-2].token_type == 'encoded-word'):
                    bare_quoted_string[-1] = EWWhiteSpaceTerminal(
                        bare_quoted_string[-1], 'fws')
        else:
            token, value = get_qcontent(value)
        bare_quoted_string.append(token)
    # Loop ends either at the closing DQUOTE or at end of input.
    if not value:
        bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
            "end of header inside quoted string"))
        return bare_quoted_string, value
    return bare_quoted_string, value[1:]
1244
def get_comment(value):
    """comment = "(" *([FWS] ccontent) [FWS] ")"
       ccontent = ctext / quoted-pair / comment

    Nested comments are handled by recursion; quoted pairs are unquoted
    by get_qp_ctext.
    """
    if value and value[0] != '(':
        raise errors.HeaderParseError(
            "expected '(' but found '{}'".format(value))
    comment = Comment()
    value = value[1:]
    while value:
        ch = value[0]
        if ch == ')':
            # Normal exit: consume the closing paren.
            return comment, value[1:]
        if ch in WSP:
            token, value = get_fws(value)
        elif ch == '(':
            token, value = get_comment(value)
        else:
            token, value = get_qp_ctext(value)
        comment.append(token)
    # Ran out of input before the comment was closed.
    comment.defects.append(errors.InvalidHeaderDefect(
        "end of header inside comment"))
    return comment, value
1269
def get_cfws(value):
    """CFWS = (1*([FWS] comment) [FWS]) / FWS

    Collect any run of folding whitespace and comments into a CFWSList.
    """
    cfws = CFWSList()
    while value and value[0] in CFWS_LEADER:
        getter = get_fws if value[0] in WSP else get_comment
        token, value = getter(value)
        cfws.append(token)
    return cfws, value
1282
def get_quoted_string(value):
    """quoted-string = [CFWS] <bare-quoted-string> [CFWS]

    'bare-quoted-string' is a parser-internal production (not in the RFC
    grammar): the quoted string proper, stripped of surrounding CFWS.
    Here we reattach any CFWS on either side.
    """
    quoted_string = QuotedString()

    def _absorb_cfws(rest):
        # Append a CFWS token if one is present; return the remainder.
        if rest and rest[0] in CFWS_LEADER:
            tok, rest = get_cfws(rest)
            quoted_string.append(tok)
        return rest

    value = _absorb_cfws(value)
    core, value = get_bare_quoted_string(value)
    quoted_string.append(core)
    value = _absorb_cfws(value)
    return quoted_string, value
1300
def get_atom(value):
    """atom = [CFWS] 1*atext [CFWS]

    The atom position may also hold an RFC 2047 encoded word, which is
    tried first when the value starts with '=?'.
    """
    atom = Atom()
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        atom.append(token)
    if value and value[0] in ATOM_ENDS:
        raise errors.HeaderParseError(
            "expected atom but found '{}'".format(value))
    core = None
    if value.startswith('=?'):
        try:
            core, value = get_encoded_word(value)
        except errors.HeaderParseError:
            # XXX: need to figure out how to register defects when
            # appropriate here.
            core = None
    if core is None:
        core, value = get_atext(value)
    atom.append(core)
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        atom.append(token)
    return atom, value
1327
def get_dot_atom_text(value):
    """ dot-text = 1*atext *("." 1*atext)

    One or more atext runs joined by single dots.  A leading, trailing,
    or doubled dot is a parse error.
    """
    dot_atom_text = DotAtomText()
    if not value or value[0] in ATOM_ENDS:
        raise errors.HeaderParseError("expected atom at a start of "
            "dot-atom-text but found '{}'".format(value))
    while value and value[0] not in ATOM_ENDS:
        atext_tok, value = get_atext(value)
        dot_atom_text.append(atext_tok)
        if value[:1] == '.':
            dot_atom_text.append(DOT)
            value = value[1:]
    # A doubled dot exits the loop above (the second '.' is in ATOM_ENDS),
    # so a trailing DOT token here means the dot-atom-text was malformed.
    if dot_atom_text[-1] is DOT:
        raise errors.HeaderParseError("expected atom at end of dot-atom-text "
            "but found '{}'".format('.'+value))
    return dot_atom_text, value
1346
def get_dot_atom(value):
    """ dot-atom = [CFWS] dot-atom-text [CFWS]

    Anywhere a dot-atom may appear, an RFC 2047 encoded word may appear
    instead; it is tried first when the value starts with '=?'.
    """
    dot_atom = DotAtom()
    if value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        dot_atom.append(token)
    core = None
    if value.startswith('=?'):
        try:
            core, value = get_encoded_word(value)
        except errors.HeaderParseError:
            # XXX: need to figure out how to register defects when
            # appropriate here.
            core = None
    if core is None:
        core, value = get_dot_atom_text(value)
    dot_atom.append(core)
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        dot_atom.append(token)
    return dot_atom, value
1371
def get_word(value):
    """word = atom / quoted-string

    Both alternatives may start with CFWS, so we strip it first to decide
    which to parse, then splice the leading CFWS (if any) back into the
    resulting sub-token.  A special character where the word should start
    raises HeaderParseError.

    The returned token is the Atom or QuotedString itself; the 'word'
    level of the formal grammar has no node in the parse tree, since the
    extra layer only gets in the way when manipulating the tree.

    """
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    if not value:
        raise errors.HeaderParseError(
            "Expected 'atom' or 'quoted-string' but found nothing.")
    first = value[0]
    if first == '"':
        word_token, value = get_quoted_string(value)
    elif first in SPECIALS:
        raise errors.HeaderParseError("Expected 'atom' or 'quoted-string' "
                                      "but found '{}'".format(value))
    else:
        word_token, value = get_atom(value)
    if leader is not None:
        word_token[:0] = [leader]
    return word_token, value
1405
def get_phrase(value):
    """ phrase = 1*word / obs-phrase
        obs-phrase = word *(word / "." / CFWS)

    A phrase is a sequence of words, and under the obsolete grammar may
    also contain periods and bare CFWS, provided it starts with a word.
    Periods and bare CFWS get an ObsoleteHeaderDefect; a phrase that does
    not start with a word (e.g. CFWS followed by a dot) gets an
    InvalidHeaderDefect, since even the obsolete grammar disallows it.

    """
    phrase = Phrase()
    try:
        word_token, value = get_word(value)
    except errors.HeaderParseError:
        phrase.defects.append(errors.InvalidHeaderDefect(
            "phrase does not start with word"))
    else:
        phrase.append(word_token)
    while value and value[0] not in PHRASE_ENDS:
        if value[0] == '.':
            # Obsolete syntax: bare period inside a phrase.
            phrase.append(DOT)
            phrase.defects.append(errors.ObsoleteHeaderDefect(
                "period in 'phrase'"))
            value = value[1:]
            continue
        try:
            token, value = get_word(value)
        except errors.HeaderParseError:
            if value[0] not in CFWS_LEADER:
                raise
            # Obsolete syntax: CFWS with no following word.
            token, value = get_cfws(value)
            phrase.defects.append(errors.ObsoleteHeaderDefect(
                "comment found without atom"))
        phrase.append(token)
    return phrase, value
1443
def get_local_part(value):
    """ local-part = dot-atom / quoted-string / obs-local-part

    Returns a LocalPart token and the remaining value.  The modern forms
    are tried first; if text remains that should have been part of the
    local part, everything consumed so far is re-parsed as an
    obs-local-part and the appropriate defect is registered.
    """
    local_part = LocalPart()
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    if not value:
        raise errors.HeaderParseError(
            "expected local-part but found '{}'".format(value))
    try:
        token, value = get_dot_atom(value)
    except errors.HeaderParseError:
        try:
            token, value = get_word(value)
        except errors.HeaderParseError:
            if value[0] != '\\' and value[0] in PHRASE_ENDS:
                raise
            # Not parseable as a word either; start with an empty token
            # and let the obs-local-part pass below pick the text apart.
            token = TokenList()
    if leader is not None:
        token[:0] = [leader]
    local_part.append(token)
    if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
        # More text follows that should have been part of the local part:
        # re-parse what we consumed plus the remainder as obs-local-part.
        obs_local_part, value = get_obs_local_part(str(local_part) + value)
        if obs_local_part.token_type == 'invalid-obs-local-part':
            local_part.defects.append(errors.InvalidHeaderDefect(
                "local-part is not dot-atom, quoted-string, or obs-local-part"))
        else:
            local_part.defects.append(errors.ObsoleteHeaderDefect(
                "local-part is not a dot-atom (contains CFWS)"))
        local_part[0] = obs_local_part
    try:
        local_part.value.encode('ascii')
    except UnicodeEncodeError:
        local_part.defects.append(errors.NonASCIILocalPartDefect(
                "local-part contains non-ASCII characters)"))
    return local_part, value
1482
def get_obs_local_part(value):
    """ obs-local-part = word *("." word)

    Parse an obsolete-syntax local part: words joined by dots, possibly
    with CFWS runs and misplaced backslashes mixed in.  Defects are
    recorded for repeated, leading, or trailing dots, missing dots
    between words, and stray backslashes; if any defect is found the
    returned token's type becomes 'invalid-obs-local-part'.
    """
    obs_local_part = ObsLocalPart()
    last_non_ws_was_dot = False
    while value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
        if value[0] == '.':
            if last_non_ws_was_dot:
                obs_local_part.defects.append(errors.InvalidHeaderDefect(
                    "invalid repeated '.'"))
            obs_local_part.append(DOT)
            last_non_ws_was_dot = True
            value = value[1:]
            continue
        elif value[0]=='\\':
            obs_local_part.append(ValueTerminal(value[0],
                                                'misplaced-special'))
            value = value[1:]
            obs_local_part.defects.append(errors.InvalidHeaderDefect(
                "'\\' character outside of quoted-string/ccontent"))
            last_non_ws_was_dot = False
            continue
        if obs_local_part and obs_local_part[-1].token_type != 'dot':
            obs_local_part.defects.append(errors.InvalidHeaderDefect(
                "missing '.' between words"))
        try:
            token, value = get_word(value)
            last_non_ws_was_dot = False
        except errors.HeaderParseError:
            if value[0] not in CFWS_LEADER:
                raise
            token, value = get_cfws(value)
        obs_local_part.append(token)
    if not obs_local_part:
        # Defensive: callers only invoke us when at least one character
        # is parseable, but raise cleanly rather than crash below.
        raise errors.HeaderParseError(
            "expected obs-local-part but found '{}'".format(value))
    # The len() guards prevent an IndexError on the [1]/[-2] lookups when
    # the whole local part parsed as a single cfws token (e.g. an input
    # that is only a comment).
    if (obs_local_part[0].token_type == 'dot' or
            obs_local_part[0].token_type == 'cfws' and
            len(obs_local_part) > 1 and
            obs_local_part[1].token_type == 'dot'):
        obs_local_part.defects.append(errors.InvalidHeaderDefect(
            "Invalid leading '.' in local part"))
    if (obs_local_part[-1].token_type == 'dot' or
            obs_local_part[-1].token_type == 'cfws' and
            len(obs_local_part) > 1 and
            obs_local_part[-2].token_type == 'dot'):
        obs_local_part.defects.append(errors.InvalidHeaderDefect(
            "Invalid trailing '.' in local part"))
    if obs_local_part.defects:
        obs_local_part.token_type = 'invalid-obs-local-part'
    return obs_local_part, value
1529
def get_dtext(value):
    r""" dtext = <printable ascii except \ [ ]> / obs-dtext
        obs-dtext = obs-NO-WS-CTL / quoted-pair

    Consume everything up to an unquoted '[' or ']'.  Non-printable ASCII
    is accepted but recorded as a NonPrintableDefect.  Quoted pairs are
    decoded, so the result is a 'ptext' ValueTerminal; if any quoted
    printables were decoded an ObsoleteHeaderDefect is also recorded.

    """
    text, value, had_qp = _get_ptext_to_endchars(value, '[]')
    token = ValueTerminal(text, 'ptext')
    if had_qp:
        token.defects.append(errors.ObsoleteHeaderDefect(
            "quoted printable found in domain-literal"))
    _validate_xtext(token)
    return token, value
1549
1550def _check_for_early_dl_end(value, domain_literal):
1551    if value:
1552        return False
1553    domain_literal.append(errors.InvalidHeaderDefect(
1554        "end of input inside domain-literal"))
1555    domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
1556    return True
1557
def get_domain_literal(value):
    """ domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS]

    Truncated input is tolerated: after each parsing step we check for an
    early end of input, in which case the closing bracket is patched in
    and a defect recorded (see _check_for_early_dl_end).
    """
    domain_literal = DomainLiteral()
    if value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        domain_literal.append(token)
    if not value:
        raise errors.HeaderParseError("expected domain-literal")
    if value[0] != '[':
        raise errors.HeaderParseError("expected '[' at start of domain-literal "
                "but found '{}'".format(value))
    value = value[1:]
    if _check_for_early_dl_end(value, domain_literal):
        return domain_literal, value
    domain_literal.append(ValueTerminal('[', 'domain-literal-start'))
    # Optional whitespace before the dtext.
    if value[0] in WSP:
        token, value = get_fws(value)
        domain_literal.append(token)
    token, value = get_dtext(value)
    domain_literal.append(token)
    if _check_for_early_dl_end(value, domain_literal):
        return domain_literal, value
    # Optional whitespace before the closing bracket.
    if value[0] in WSP:
        token, value = get_fws(value)
        domain_literal.append(token)
    if _check_for_early_dl_end(value, domain_literal):
        return domain_literal, value
    if value[0] != ']':
        raise errors.HeaderParseError("expected ']' at end of domain-literal "
                "but found '{}'".format(value))
    domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
    value = value[1:]
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        domain_literal.append(token)
    return domain_literal, value
1596
def get_domain(value):
    """ domain = dot-atom / domain-literal / obs-domain
        obs-domain = atom *("." atom))

    Parse a domain, preferring the modern dot-atom and domain-literal
    forms; fall back to the obsolete atoms-joined-by-dots form, which
    gets an ObsoleteHeaderDefect.
    """
    domain = Domain()
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    if not value:
        raise errors.HeaderParseError(
            "expected domain but found '{}'".format(value))
    if value[0] == '[':
        literal, value = get_domain_literal(value)
        if leader is not None:
            literal[:0] = [leader]
        domain.append(literal)
        return domain, value
    try:
        core, value = get_dot_atom(value)
    except errors.HeaderParseError:
        core, value = get_atom(value)
    if value and value[0] == '@':
        raise errors.HeaderParseError('Invalid Domain')
    if leader is not None:
        core[:0] = [leader]
    domain.append(core)
    if value and value[0] == '.':
        # Obsolete syntax: atoms joined by dots, possibly with CFWS.
        domain.defects.append(errors.ObsoleteHeaderDefect(
            "domain is not a dot-atom (contains CFWS)"))
        if domain[0].token_type == 'dot-atom':
            # Flatten the dot-atom's children into the domain itself so
            # the additional atoms can be appended alongside them.
            domain[:] = domain[0]
        while value and value[0] == '.':
            domain.append(DOT)
            atom_tok, value = get_atom(value[1:])
            domain.append(atom_tok)
    return domain, value
1634
def get_addr_spec(value):
    """ addr-spec = local-part "@" domain

    A local part with no following '@' is accepted but registered as an
    InvalidHeaderDefect.
    """
    addr_spec = AddrSpec()
    local, value = get_local_part(value)
    addr_spec.append(local)
    if value[:1] != '@':
        addr_spec.defects.append(errors.InvalidHeaderDefect(
            "addr-spec local part with no domain"))
        return addr_spec, value
    addr_spec.append(ValueTerminal('@', 'address-at-symbol'))
    domain, value = get_domain(value[1:])
    addr_spec.append(domain)
    return addr_spec, value
1650
def get_obs_route(value):
    """ obs-route = obs-domain-list ":"
        obs-domain-list = *(CFWS / ",") "@" domain *("," [CFWS] ["@" domain])

        Returns an obs-route token with the appropriate sub-tokens (that is,
        there is no obs-domain-list in the parse tree).
    """
    obs_route = ObsRoute()
    # Leading noise: any mix of commas and CFWS before the first '@'.
    while value and (value[0]==',' or value[0] in CFWS_LEADER):
        if value[0] in CFWS_LEADER:
            token, value = get_cfws(value)
            obs_route.append(token)
        else:
            obs_route.append(ListSeparator)
            value = value[1:]
    if not value or value[0] != '@':
        raise errors.HeaderParseError(
            "expected obs-route domain but found '{}'".format(value))
    obs_route.append(RouteComponentMarker)
    token, value = get_domain(value[1:])
    obs_route.append(token)
    while value and value[0]==',':
        obs_route.append(ListSeparator)
        value = value[1:]
        if not value:
            break
        if value[0] in CFWS_LEADER:
            token, value = get_cfws(value)
            obs_route.append(token)
        # The 'value and' guard matters: the CFWS above may have consumed
        # the rest of the input, and an unguarded value[0] would raise
        # IndexError instead of the HeaderParseError below.
        if value and value[0] == '@':
            obs_route.append(RouteComponentMarker)
            token, value = get_domain(value[1:])
            obs_route.append(token)
    if not value:
        raise errors.HeaderParseError("end of header while parsing obs-route")
    if value[0] != ':':
        raise errors.HeaderParseError("expected ':' marking end of "
            "obs-route but found '{}'".format(value))
    obs_route.append(ValueTerminal(':', 'end-of-obs-route-marker'))
    return obs_route, value[1:]
1691
def get_angle_addr(value):
    """ angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr
        obs-angle-addr = [CFWS] "<" obs-route addr-spec ">" [CFWS]

    Returns an AngleAddr token and the remaining value.  A null address
    ('<>') and a missing closing '>' are tolerated, each with an
    InvalidHeaderDefect registered.
    """
    angle_addr = AngleAddr()
    if value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        angle_addr.append(token)
    if not value or value[0] != '<':
        raise errors.HeaderParseError(
            "expected angle-addr but found '{}'".format(value))
    angle_addr.append(ValueTerminal('<', 'angle-addr-start'))
    value = value[1:]
    # Although it is not legal per RFC5322, SMTP uses '<>' in certain
    # circumstances.
    if value[0] == '>':
        angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
        angle_addr.defects.append(errors.InvalidHeaderDefect(
            "null addr-spec in angle-addr"))
        value = value[1:]
        return angle_addr, value
    try:
        token, value = get_addr_spec(value)
    except errors.HeaderParseError:
        # Fall back to the obsolete syntax: a source route (list of
        # domains) preceding the addr-spec.
        try:
            token, value = get_obs_route(value)
            angle_addr.defects.append(errors.ObsoleteHeaderDefect(
                "obsolete route specification in angle-addr"))
        except errors.HeaderParseError:
            raise errors.HeaderParseError(
                "expected addr-spec or obs-route but found '{}'".format(value))
        angle_addr.append(token)
        token, value = get_addr_spec(value)
    angle_addr.append(token)
    if value and value[0] == '>':
        value = value[1:]
    else:
        angle_addr.defects.append(errors.InvalidHeaderDefect(
            "missing trailing '>' on angle-addr"))
    # The '>' terminal is appended even when it was missing from the
    # input, so the token renders as a complete angle-addr.
    angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        angle_addr.append(token)
    return angle_addr, value
1737
def get_display_name(value):
    """ display-name = phrase

    Since display-name is just a renaming of phrase in the grammar, the
    returned DisplayName token carries the phrase's children and defects
    directly rather than wrapping a nested phrase token.

    """
    display_name = DisplayName()
    phrase, value = get_phrase(value)
    display_name.extend(phrase)
    display_name.defects = list(phrase.defects)
    return display_name, value
1751
1752
def get_name_addr(value):
    """ name-addr = [display-name] angle-addr

    """
    name_addr = NameAddr()
    # Leading CFWS could belong to either the optional display name or
    # the angle-addr, so peel it off first and splice it in later.
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
        if not value:
            raise errors.HeaderParseError(
                "expected name-addr but found '{}'".format(leader))
    if value[0] != '<':
        if value[0] in PHRASE_ENDS:
            raise errors.HeaderParseError(
                "expected name-addr but found '{}'".format(value))
        display, value = get_display_name(value)
        if not value:
            raise errors.HeaderParseError(
                "expected name-addr but found '{}'".format(display))
        if leader is not None:
            # The CFWS belongs inside the display name's first word.
            display[0][:0] = [leader]
            leader = None
        name_addr.append(display)
    angle, value = get_angle_addr(value)
    if leader is not None:
        angle[:0] = [leader]
    name_addr.append(angle)
    return name_addr, value
1782
def get_mailbox(value):
    """ mailbox = name-addr / addr-spec

    """
    # The only way to tell a name-addr from an addr-spec is to attempt
    # each parse in turn.
    mailbox = Mailbox()
    for getter in (get_name_addr, get_addr_spec):
        try:
            token, value = getter(value)
            break
        except errors.HeaderParseError:
            continue
    else:
        raise errors.HeaderParseError(
            "expected mailbox but found '{}'".format(value))
    if any(isinstance(defect, errors.InvalidHeaderDefect)
           for defect in token.all_defects):
        mailbox.token_type = 'invalid-mailbox'
    mailbox.append(token)
    return mailbox, value
1803
def get_invalid_mailbox(value, endchars):
    """Consume everything up to (not including) any character in endchars.

    This rule is outside the formal grammar.  The returned InvalidMailbox
    TokenList behaves like a Mailbox, except that its data attributes are
    all None.
    """
    bad = InvalidMailbox()
    while value:
        ch = value[0]
        if ch in endchars:
            break
        if ch in PHRASE_ENDS:
            # Specials can't start a phrase; consume them one at a time.
            bad.append(ValueTerminal(ch, 'misplaced-special'))
            value = value[1:]
        else:
            phrase, value = get_phrase(value)
            bad.append(phrase)
    return bad, value
1821
def get_mailbox_list(value):
    """ mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list
        obs-mbox-list = *([CFWS] ",") mailbox *("," [mailbox / CFWS])

    For this routine we go outside the formal grammar in order to improve error
    handling.  We recognize the end of the mailbox list only at the end of the
    value or at a ';' (the group terminator).  This is so that we can turn
    invalid mailboxes into InvalidMailbox tokens and continue parsing any
    remaining valid mailboxes.  We also allow all mailbox entries to be null,
    and this condition is handled appropriately at a higher level.

    """
    mailbox_list = MailboxList()
    while value and value[0] != ';':
        try:
            token, value = get_mailbox(value)
            mailbox_list.append(token)
        except errors.HeaderParseError:
            # Not a valid mailbox; decide whether this entry is empty
            # (obsolete but tolerated) or junk (recorded as invalid).
            leader = None
            if value[0] in CFWS_LEADER:
                leader, value = get_cfws(value)
                if not value or value[0] in ',;':
                    # CFWS-only entry: an empty element in the list.
                    mailbox_list.append(leader)
                    mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
                        "empty element in mailbox-list"))
                else:
                    # CFWS followed by junk: consume up to the next
                    # delimiter and keep the CFWS attached to the junk.
                    token, value = get_invalid_mailbox(value, ',;')
                    if leader is not None:
                        token[:0] = [leader]
                    mailbox_list.append(token)
                    mailbox_list.defects.append(errors.InvalidHeaderDefect(
                        "invalid mailbox in mailbox-list"))
            elif value[0] == ',':
                # A comma with nothing before it: an empty element.
                mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
                    "empty element in mailbox-list"))
            else:
                # Junk with no leading CFWS.  (leader is always None on
                # this branch; it is only assigned in the CFWS branch above.)
                token, value = get_invalid_mailbox(value, ',;')
                if leader is not None:
                    token[:0] = [leader]
                mailbox_list.append(token)
                mailbox_list.defects.append(errors.InvalidHeaderDefect(
                    "invalid mailbox in mailbox-list"))
        if value and value[0] not in ',;':
            # Crap after mailbox; treat it as an invalid mailbox.
            # The mailbox info will still be available.
            mailbox = mailbox_list[-1]
            mailbox.token_type = 'invalid-mailbox'
            token, value = get_invalid_mailbox(value, ',;')
            mailbox.extend(token)
            mailbox_list.defects.append(errors.InvalidHeaderDefect(
                "invalid mailbox in mailbox-list"))
        if value and value[0] == ',':
            mailbox_list.append(ListSeparator)
            value = value[1:]
    return mailbox_list, value
1877
1878
def get_group_list(value):
    """ group-list = mailbox-list / CFWS / obs-group-list
        obs-group-list = 1*([CFWS] ",") [CFWS]

    Returns a GroupList token and the unparsed remainder of value.
    """
    group_list = GroupList()
    if not value:
        group_list.defects.append(errors.InvalidHeaderDefect(
            "end of header before group-list"))
        return group_list, value
    leader = None
    if value and value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
        if not value:
            # This should never happen in email parsing, since CFWS-only is a
            # legal alternative to group-list in a group, which is the only
            # place group-list appears.
            group_list.defects.append(errors.InvalidHeaderDefect(
                "end of header in group-list"))
            group_list.append(leader)
            return group_list, value
        if value[0] == ';':
            # The CFWS by itself is the group-list (the CFWS alternative).
            group_list.append(leader)
            return group_list, value
    token, value = get_mailbox_list(value)
    if not token.all_mailboxes:
        # No actual mailboxes at all: this is the obs-group-list form.
        if leader is not None:
            group_list.append(leader)
        group_list.extend(token)
        group_list.defects.append(errors.ObsoleteHeaderDefect(
            "group-list with empty entries"))
        return group_list, value
    if leader is not None:
        token[:0] = [leader]
    group_list.append(token)
    return group_list, value
1915
def get_group(value):
    """group = display-name ":" [group-list] ";" [CFWS]

    Returns a Group token and the unparsed remainder of value; raises
    HeaderParseError if value does not start with a group.
    """
    group = Group()
    display_name, value = get_display_name(value)
    if not value or value[0] != ':':
        raise errors.HeaderParseError("expected ':' at end of group "
            "display name but found '{}'".format(value))
    group.append(display_name)
    group.append(ValueTerminal(':', 'group-display-name-terminator'))
    value = value[1:]
    if value.startswith(';'):
        # Empty group: nothing between the ':' and the ';'.
        group.append(ValueTerminal(';', 'group-terminator'))
        return group, value[1:]
    group_list, value = get_group_list(value)
    group.append(group_list)
    if not value:
        group.defects.append(errors.InvalidHeaderDefect(
            "end of header in group"))
    elif value[0] != ';':
        raise errors.HeaderParseError(
            "expected ';' at end of group but found {}".format(value))
    group.append(ValueTerminal(';', 'group-terminator'))
    value = value[1:]
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        group.append(cfws)
    return group, value
1945
def get_address(value):
    """ address = mailbox / group

    Note that counter-intuitively, an address can be either a single address or
    a list of addresses (a group).  This is why the returned Address object has
    a 'mailboxes' attribute which treats a single address as a list of length
    one.  When you need to differentiate between the two cases, extract the
    single element, which is either a mailbox or a group token.

    """
    # The formal grammar isn't much help here: mailbox and group (with the
    # obsolete forms included) look identical until an @, <, or : shows up.
    # So we simply attempt each alternative in turn, most likely first.
    address = Address()
    for parse in (get_group, get_mailbox):
        try:
            token, value = parse(value)
            break
        except errors.HeaderParseError:
            pass
    else:
        raise errors.HeaderParseError(
            "expected address but found '{}'".format(value))
    address.append(token)
    return address, value
1974
def get_address_list(value):
    """ address_list = (address *("," address)) / obs-addr-list
        obs-addr-list = *([CFWS] ",") address *("," [address / CFWS])

    We depart from the formal grammar here by continuing to parse until the end
    of the input, assuming the input to be entirely composed of an
    address-list.  This is always true in email parsing, and allows us
    to skip invalid addresses to parse additional valid ones.

    """
    address_list = AddressList()
    while value:
        try:
            token, value = get_address(value)
            address_list.append(token)
        except errors.HeaderParseError:
            # Not a valid address; decide whether this entry is empty
            # (obsolete but tolerated) or junk (recorded as invalid).
            leader = None
            if value[0] in CFWS_LEADER:
                leader, value = get_cfws(value)
                if not value or value[0] == ',':
                    # CFWS-only entry: an empty element in the list.
                    address_list.append(leader)
                    address_list.defects.append(errors.ObsoleteHeaderDefect(
                        "address-list entry with no content"))
                else:
                    # CFWS followed by junk: consume up to the next comma
                    # and keep the CFWS attached to the junk.
                    token, value = get_invalid_mailbox(value, ',')
                    if leader is not None:
                        token[:0] = [leader]
                    address_list.append(Address([token]))
                    address_list.defects.append(errors.InvalidHeaderDefect(
                        "invalid address in address-list"))
            elif value[0] == ',':
                # A comma with nothing before it: an empty element.
                address_list.defects.append(errors.ObsoleteHeaderDefect(
                    "empty element in address-list"))
            else:
                # Junk with no leading CFWS.  (leader is always None on
                # this branch; it is only assigned in the CFWS branch above.)
                token, value = get_invalid_mailbox(value, ',')
                if leader is not None:
                    token[:0] = [leader]
                address_list.append(Address([token]))
                address_list.defects.append(errors.InvalidHeaderDefect(
                    "invalid address in address-list"))
        if value and value[0] != ',':
            # Crap after address; treat it as an invalid mailbox.
            # The mailbox info will still be available.
            mailbox = address_list[-1][0]
            mailbox.token_type = 'invalid-mailbox'
            token, value = get_invalid_mailbox(value, ',')
            mailbox.extend(token)
            address_list.defects.append(errors.InvalidHeaderDefect(
                "invalid address in address-list"))
        if value:  # Must be a , at this point.
            address_list.append(ValueTerminal(',', 'list-separator'))
            value = value[1:]
    return address_list, value
2028
2029
def get_no_fold_literal(value):
    """ no-fold-literal = "[" *dtext "]"
    """
    nfl = NoFoldLiteral()
    if not value:
        raise errors.HeaderParseError(
            "expected no-fold-literal but found '{}'".format(value))
    if value[0] != '[':
        raise errors.HeaderParseError(
            "expected '[' at the start of no-fold-literal "
            "but found '{}'".format(value))
    nfl.append(ValueTerminal('[', 'no-fold-literal-start'))
    dtext, value = get_dtext(value[1:])
    nfl.append(dtext)
    if not value.startswith(']'):
        raise errors.HeaderParseError(
            "expected ']' at the end of no-fold-literal "
            "but found '{}'".format(value))
    nfl.append(ValueTerminal(']', 'no-fold-literal-end'))
    return nfl, value[1:]
2051
def get_msg_id(value):
    """msg-id = [CFWS] "<" id-left '@' id-right  ">" [CFWS]
       id-left = dot-atom-text / obs-id-left
       id-right = dot-atom-text / no-fold-literal / obs-id-right
       no-fold-literal = "[" *dtext "]"

    Returns a MsgID token and the unparsed remainder of value.  Most
    deviations from the grammar are recorded as defects rather than raised.
    """
    msg_id = MsgID()
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        msg_id.append(token)
    if not value or value[0] != '<':
        raise errors.HeaderParseError(
            "expected msg-id but found '{}'".format(value))
    msg_id.append(ValueTerminal('<', 'msg-id-start'))
    value = value[1:]
    # Parse id-left.
    try:
        token, value = get_dot_atom_text(value)
    except errors.HeaderParseError:
        try:
            # obs-id-left is same as local-part of addr-spec.
            token, value = get_obs_local_part(value)
            msg_id.defects.append(errors.ObsoleteHeaderDefect(
                "obsolete id-left in msg-id"))
        except errors.HeaderParseError:
            raise errors.HeaderParseError(
                "expected dot-atom-text or obs-id-left"
                " but found '{}'".format(value))
    msg_id.append(token)
    if not value or value[0] != '@':
        msg_id.defects.append(errors.InvalidHeaderDefect(
            "msg-id with no id-right"))
        # Even though there is no id-right, if the local part
        # ends with `>` let's just parse it too and return
        # along with the defect.
        if value and value[0] == '>':
            msg_id.append(ValueTerminal('>', 'msg-id-end'))
            value = value[1:]
        return msg_id, value
    msg_id.append(ValueTerminal('@', 'address-at-symbol'))
    value = value[1:]
    # Parse id-right: dot-atom-text, then no-fold-literal, then the
    # obsolete domain form as a last resort.
    try:
        token, value = get_dot_atom_text(value)
    except errors.HeaderParseError:
        try:
            token, value = get_no_fold_literal(value)
        except errors.HeaderParseError:
            try:
                token, value = get_domain(value)
                msg_id.defects.append(errors.ObsoleteHeaderDefect(
                    "obsolete id-right in msg-id"))
            except errors.HeaderParseError:
                raise errors.HeaderParseError(
                    "expected dot-atom-text, no-fold-literal or obs-id-right"
                    " but found '{}'".format(value))
    msg_id.append(token)
    if value and value[0] == '>':
        value = value[1:]
    else:
        msg_id.defects.append(errors.InvalidHeaderDefect(
            "missing trailing '>' on msg-id"))
    msg_id.append(ValueTerminal('>', 'msg-id-end'))
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        msg_id.append(token)
    return msg_id, value
2119
2120
def parse_message_id(value):
    """message-id      =   "Message-ID:" msg-id CRLF

    Parse the entire value as a single msg-id; problems are recorded as
    defects on the returned token rather than raised.
    """
    try:
        msg_id, value = get_msg_id(value)
    except errors.HeaderParseError as parse_err:
        # Not a parseable msg-id at all; keep the raw text as an
        # invalid-message-id token.
        message_id = InvalidMessageID(get_unstructured(value))
        message_id.defects.append(
            errors.InvalidHeaderDefect("Invalid msg-id: {!r}".format(parse_err)))
        return message_id
    message_id = MessageID()
    message_id.append(msg_id)
    # A valid msg-id consumes the whole value; anything left over is excess.
    if value:
        message_id.defects.append(errors.InvalidHeaderDefect(
            "Unexpected {!r}".format(value)))
    return message_id
2140
2141#
2142# XXX: As I begin to add additional header parsers, I'm realizing we probably
# have two levels of parser routines: the get_XXX methods that get a token in
2144# the grammar, and parse_XXX methods that parse an entire field value.  So
2145# get_address_list above should really be a parse_ method, as probably should
2146# be get_unstructured.
2147#
2148
def parse_mime_version(value):
    """ mime-version = [CFWS] 1*digit [CFWS] "." [CFWS] 1*digit [CFWS]

    Parse value as a MIME-Version header value, recording any problems as
    defects on the returned MIMEVersion token rather than raising.
    """
    # The [CFWS] is implicit in the RFC 2045 BNF.
    # XXX: This routine is a bit verbose, should factor out a get_int method.
    mime_version = MIMEVersion()
    if not value:
        mime_version.defects.append(errors.HeaderMissingRequiredValue(
            "Missing MIME version number (eg: 1.0)"))
        return mime_version
    if value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
        if not value:
            mime_version.defects.append(errors.HeaderMissingRequiredValue(
                "Expected MIME version number but found only CFWS"))
    # Accumulate the major version: everything up to the '.' or CFWS.
    digits = ''
    while value and value[0] != '.' and value[0] not in CFWS_LEADER:
        digits += value[0]
        value = value[1:]
    # str.isdigit() is False for the empty string, so this also catches a
    # completely missing major number.
    if not digits.isdigit():
        mime_version.defects.append(errors.InvalidHeaderDefect(
            "Expected MIME major version number but found {!r}".format(digits)))
        mime_version.append(ValueTerminal(digits, 'xtext'))
    else:
        mime_version.major = int(digits)
        mime_version.append(ValueTerminal(digits, 'digits'))
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
    if not value or value[0] != '.':
        # No '.' separator: at most a major number was present.
        if mime_version.major is not None:
            mime_version.defects.append(errors.InvalidHeaderDefect(
                "Incomplete MIME version; found only major number"))
        if value:
            mime_version.append(ValueTerminal(value, 'xtext'))
        return mime_version
    mime_version.append(ValueTerminal('.', 'version-separator'))
    value = value[1:]
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
    if not value:
        # Separator present but no minor number follows it.
        if mime_version.major is not None:
            mime_version.defects.append(errors.InvalidHeaderDefect(
                "Incomplete MIME version; found only major number"))
        return mime_version
    # Accumulate the minor version: everything up to CFWS (or end).
    digits = ''
    while value and value[0] not in CFWS_LEADER:
        digits += value[0]
        value = value[1:]
    if not digits.isdigit():
        mime_version.defects.append(errors.InvalidHeaderDefect(
            "Expected MIME minor version number but found {!r}".format(digits)))
        mime_version.append(ValueTerminal(digits, 'xtext'))
    else:
        mime_version.minor = int(digits)
        mime_version.append(ValueTerminal(digits, 'digits'))
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
    if value:
        # Anything left after the minor number (and optional CFWS) is junk.
        mime_version.defects.append(errors.InvalidHeaderDefect(
            "Excess non-CFWS text after MIME version"))
        mime_version.append(ValueTerminal(value, 'xtext'))
    return mime_version
2216
def get_invalid_parameter(value):
    """Consume everything up to (not including) the next ';'.

    This rule is outside the formal grammar.  The returned InvalidParameter
    TokenList behaves like a Parameter, except that its data attributes are
    all None.
    """
    bad = InvalidParameter()
    while value:
        ch = value[0]
        if ch == ';':
            break
        if ch in PHRASE_ENDS:
            # Specials can't start a phrase; consume them one at a time.
            bad.append(ValueTerminal(ch, 'misplaced-special'))
            value = value[1:]
        else:
            phrase, value = get_phrase(value)
            bad.append(phrase)
    return bad, value
2234
def get_ttext(value):
    """ttext = <matches _ttext_matcher>

    Any character that is not a TOKEN_END is accepted as ttext; defects are
    added to the token for non-ttext characters, and for *any* non-printable
    (beyond what the RFC excludes), following the spirit of RFC 5322.
    """
    match = _non_token_end_matcher(value)
    if not match:
        raise errors.HeaderParseError(
            "expected ttext but found '{}'".format(value))
    text = match.group()
    token = ValueTerminal(text, 'ttext')
    _validate_xtext(token)
    return token, value[len(text):]
2253
def get_token(value):
    """token = [CFWS] 1*ttext [CFWS]

    The RFC equivalent of ttext is any US-ASCII chars except space, ctls, or
    tspecials.  We also exclude tabs even though the RFC doesn't.

    The RFC implies the CFWS but is not explicit about it in the BNF.
    """
    result = Token()
    # Optional leading CFWS.
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        result.append(cfws)
    if value and value[0] in TOKEN_ENDS:
        raise errors.HeaderParseError(
            "expected token but found '{}'".format(value))
    text, value = get_ttext(value)
    result.append(text)
    # Optional trailing CFWS.
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        result.append(cfws)
    return result, value
2276
def get_attrtext(value):
    """attrtext = 1*(any non-ATTRIBUTE_ENDS character)

    Any character that is not an ATTRIBUTE_END is accepted as attrtext;
    defects are added to the token for non-attrtext characters, and for
    *any* non-printable (beyond what the RFC excludes), following the
    spirit of RFC 5322.
    """
    match = _non_attribute_end_matcher(value)
    if not match:
        raise errors.HeaderParseError(
            "expected attrtext but found {!r}".format(value))
    text = match.group()
    token = ValueTerminal(text, 'attrtext')
    _validate_xtext(token)
    return token, value[len(text):]
2295
def get_attribute(value):
    """ [CFWS] 1*attrtext [CFWS]

    This version of the BNF makes the CFWS explicit, and as usual we use a
    value terminal for the actual run of characters.  The RFC equivalent of
    attrtext is the token characters, with the subtraction of '*', "'", and '%'.
    We include tab in the excluded set just as we do for token.
    """
    result = Attribute()
    # Optional leading CFWS.
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        result.append(cfws)
    if value and value[0] in ATTRIBUTE_ENDS:
        raise errors.HeaderParseError(
            "expected token but found '{}'".format(value))
    text, value = get_attrtext(value)
    result.append(text)
    # Optional trailing CFWS.
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        result.append(cfws)
    return result, value
2318
def get_extended_attrtext(value):
    """attrtext = 1*(any non-ATTRIBUTE_ENDS character plus '%')

    This is a special parsing routine so that we get a value that
    includes % escapes as a single string (which we decode as a single
    string later).
    """
    match = _non_extended_attribute_end_matcher(value)
    if not match:
        raise errors.HeaderParseError(
            "expected extended attrtext but found {!r}".format(value))
    text = match.group()
    token = ValueTerminal(text, 'extended-attrtext')
    _validate_xtext(token)
    return token, value[len(text):]
2336
def get_extended_attribute(value):
    """ [CFWS] 1*extended_attrtext [CFWS]

    This is like the non-extended version except we allow % characters, so that
    we can pick up an encoded value as a single string.
    """
    # XXX: should we have an ExtendedAttribute TokenList?
    result = Attribute()
    # Optional leading CFWS.
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        result.append(cfws)
    if value and value[0] in EXTENDED_ATTRIBUTE_ENDS:
        raise errors.HeaderParseError(
            "expected token but found '{}'".format(value))
    text, value = get_extended_attrtext(value)
    result.append(text)
    # Optional trailing CFWS.
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        result.append(cfws)
    return result, value
2358
def get_section(value):
    """ '*' digits

    The formal BNF is more complicated because leading 0s are not allowed.  We
    check for that and add a defect.  We also assume no CFWS is allowed between
    the '*' and the digits, though the RFC is not crystal clear on that.
    The caller should already have dealt with leading CFWS.

    """
    section = Section()
    if not value or value[0] != '*':
        raise errors.HeaderParseError("Expected section but found {}".format(
                                        value))
    section.append(ValueTerminal('*', 'section-marker'))
    value = value[1:]
    if not value or not value[0].isdigit():
        raise errors.HeaderParseError("Expected section number but "
                                      "found {}".format(value))
    digits = ''
    while value and value[0].isdigit():
        digits += value[0]
        value = value[1:]
    if digits[0] == '0' and digits != '0':
        # This must be InvalidHeaderDefect: the previously referenced
        # errors.InvalidHeaderError does not exist, so reaching this branch
        # raised AttributeError instead of recording the defect.
        section.defects.append(errors.InvalidHeaderDefect(
                "section number has an invalid leading 0"))
    section.number = int(digits)
    section.append(ValueTerminal(digits, 'digits'))
    return section, value
2387
2388
def get_value(value):
    """ quoted-string / attribute

    Parse whichever alternative is present, folding any leading CFWS into
    the resulting token.
    """
    if not value:
        raise errors.HeaderParseError("Expected value but found end of string")
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
        if not value:
            raise errors.HeaderParseError("Expected value but found "
                                          "only {}".format(leader))
    # A leading '"' selects the quoted-string alternative.
    parse = get_quoted_string if value[0] == '"' else get_extended_attribute
    token, value = parse(value)
    if leader is not None:
        token[:0] = [leader]
    result = Value()
    result.append(token)
    return result, value
2410
def get_parameter(value):
    """ attribute [section] ["*"] [CFWS] "=" value

    The CFWS is implied by the RFC but not made explicit in the BNF.  This
    simplified form of the BNF from the RFC is made to conform with the RFC BNF
    through some extra checks.  We do it this way because it makes both error
    recovery and working with the resulting parse tree easier.

    Returns a Parameter token and the unparsed remainder of value.
    """
    # It is possible CFWS would also be implicitly allowed between the section
    # and the 'extended-attribute' marker (the '*') , but we've never seen that
    # in the wild and we will therefore ignore the possibility.
    param = Parameter()
    token, value = get_attribute(value)
    param.append(token)
    if not value or value[0] == ';':
        param.defects.append(errors.InvalidHeaderDefect("Parameter contains "
            "name ({}) but no value".format(token)))
        return param, value
    if value[0] == '*':
        try:
            token, value = get_section(value)
            param.sectioned = True
            param.append(token)
        except errors.HeaderParseError:
            pass
        if not value:
            raise errors.HeaderParseError("Incomplete parameter")
        if value[0] == '*':
            param.append(ValueTerminal('*', 'extended-parameter-marker'))
            value = value[1:]
            param.extended = True
    if value[0] != '=':
        raise errors.HeaderParseError("Parameter not followed by '='")
    param.append(ValueTerminal('=', 'parameter-separator'))
    value = value[1:]
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        param.append(token)
    remainder = None
    appendto = param
    if param.extended and value and value[0] == '"':
        # Now for some serious hackery to handle the common invalid case of
        # double quotes around an extended value.  We also accept (with defect)
        # a value marked as encoded that isn't really.
        qstring, remainder = get_quoted_string(value)
        inner_value = qstring.stripped_value
        semi_valid = False
        if param.section_number == 0:
            if inner_value and inner_value[0] == "'":
                semi_valid = True
            else:
                token, rest = get_attrtext(inner_value)
                if rest and rest[0] == "'":
                    semi_valid = True
        else:
            # A continuation section is "semi valid" if the whole inner
            # value parses as one extended-attrtext run.
            try:
                token, rest = get_extended_attrtext(inner_value)
            except errors.HeaderParseError:
                # Only a parse failure is expected here; narrowed from a
                # bare 'except:' that also swallowed e.g. KeyboardInterrupt.
                pass
            else:
                if not rest:
                    semi_valid = True
        if semi_valid:
            param.defects.append(errors.InvalidHeaderDefect(
                "Quoted string value for extended parameter is invalid"))
            param.append(qstring)
            for t in qstring:
                if t.token_type == 'bare-quoted-string':
                    # Re-parse the quoted content in place: empty the
                    # bare-quoted-string and fill it from inner_value below.
                    t[:] = []
                    appendto = t
                    break
            value = inner_value
        else:
            remainder = None
            param.defects.append(errors.InvalidHeaderDefect(
                "Parameter marked as extended but appears to have a "
                "quoted string value that is non-encoded"))
    if value and value[0] == "'":
        token = None
    else:
        token, value = get_value(value)
    if not param.extended or param.section_number > 0:
        if not value or value[0] != "'":
            appendto.append(token)
            if remainder is not None:
                assert not value, value
                value = remainder
            return param, value
        param.defects.append(errors.InvalidHeaderDefect(
            "Apparent initial-extended-value but attribute "
            "was not marked as extended or was not initial section"))
    if not value:
        # Assume the charset/lang is missing and the token is the value.
        param.defects.append(errors.InvalidHeaderDefect(
            "Missing required charset/lang delimiters"))
        appendto.append(token)
        if remainder is None:
            return param, value
    else:
        if token is not None:
            for t in token:
                if t.token_type == 'extended-attrtext':
                    break
            # Relabel the charset portion as plain attrtext; it is the
            # charset name, not percent-encoded data.  (This line was
            # previously written with '==', a no-op comparison instead of
            # the intended assignment.)
            t.token_type = 'attrtext'
            appendto.append(t)
            param.charset = t.value
        if value[0] != "'":
            raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
                                          "delimiter, but found {!r}".format(value))
        appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
        value = value[1:]
        if value and value[0] != "'":
            token, value = get_attrtext(value)
            appendto.append(token)
            param.lang = token.value
            if not value or value[0] != "'":
                raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
                                  "delimiter, but found {}".format(value))
        appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
        value = value[1:]
    if remainder is not None:
        # Treat the rest of value as bare quoted string content.
        v = Value()
        while value:
            if value[0] in WSP:
                token, value = get_fws(value)
            elif value[0] == '"':
                token = ValueTerminal('"', 'DQUOTE')
                value = value[1:]
            else:
                token, value = get_qcontent(value)
            v.append(token)
        token = v
    else:
        token, value = get_value(value)
    appendto.append(token)
    if remainder is not None:
        assert not value, value
        value = remainder
    return param, value
2552
2553def parse_mime_parameters(value):
2554    """ parameter *( ";" parameter )
2555
2556    That BNF is meant to indicate this routine should only be called after
2557    finding and handling the leading ';'.  There is no corresponding rule in
2558    the formal RFC grammar, but it is more convenient for us for the set of
2559    parameters to be treated as its own TokenList.
2560
2561    This is 'parse' routine because it consumes the remaining value, but it
2562    would never be called to parse a full header.  Instead it is called to
2563    parse everything after the non-parameter value of a specific MIME header.
2564
2565    """
2566    mime_parameters = MimeParameters()
2567    while value:
2568        try:
2569            token, value = get_parameter(value)
2570            mime_parameters.append(token)
2571        except errors.HeaderParseError as err:
2572            leader = None
2573            if value[0] in CFWS_LEADER:
2574                leader, value = get_cfws(value)
2575            if not value:
2576                mime_parameters.append(leader)
2577                return mime_parameters
2578            if value[0] == ';':
2579                if leader is not None:
2580                    mime_parameters.append(leader)
2581                mime_parameters.defects.append(errors.InvalidHeaderDefect(
2582                    "parameter entry with no content"))
2583            else:
2584                token, value = get_invalid_parameter(value)
2585                if leader:
2586                    token[:0] = [leader]
2587                mime_parameters.append(token)
2588                mime_parameters.defects.append(errors.InvalidHeaderDefect(
2589                    "invalid parameter {!r}".format(token)))
2590        if value and value[0] != ';':
2591            # Junk after the otherwise valid parameter.  Mark it as
2592            # invalid, but it will have a value.
2593            param = mime_parameters[-1]
2594            param.token_type = 'invalid-parameter'
2595            token, value = get_invalid_parameter(value)
2596            param.extend(token)
2597            mime_parameters.defects.append(errors.InvalidHeaderDefect(
2598                "parameter with invalid trailing text {!r}".format(token)))
2599        if value:
2600            # Must be a ';' at this point.
2601            mime_parameters.append(ValueTerminal(';', 'parameter-separator'))
2602            value = value[1:]
2603    return mime_parameters
2604
2605def _find_mime_parameters(tokenlist, value):
2606    """Do our best to find the parameters in an invalid MIME header
2607
2608    """
2609    while value and value[0] != ';':
2610        if value[0] in PHRASE_ENDS:
2611            tokenlist.append(ValueTerminal(value[0], 'misplaced-special'))
2612            value = value[1:]
2613        else:
2614            token, value = get_phrase(value)
2615            tokenlist.append(token)
2616    if not value:
2617        return
2618    tokenlist.append(ValueTerminal(';', 'parameter-separator'))
2619    tokenlist.append(parse_mime_parameters(value[1:]))
2620
def parse_content_type_header(value):
    """ maintype "/" subtype *( ";" parameter )

    The maintype and subtype are tokens.  Theoretically they could
    be checked against the official IANA list + x-token, but we
    don't do that.

    Parse errors are recorded as defects on the returned ContentType
    token rather than raised, so a best-effort parse is always available.
    """
    ctype = ContentType()
    if not value:
        ctype.defects.append(errors.HeaderMissingRequiredValue(
            "Missing content type specification"))
        return ctype
    try:
        token, value = get_token(value)
    except errors.HeaderParseError:
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Expected content maintype but found {!r}".format(value)))
        _find_mime_parameters(ctype, value)
        return ctype
    ctype.append(token)
    # XXX: If we really want to follow the formal grammar we should make
    # maintype and subtype specialized TokenLists here.  Probably not worth it.
    if not value or value[0] != '/':
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Invalid content type"))
        if value:
            _find_mime_parameters(ctype, value)
        return ctype
    ctype.maintype = token.value.strip().lower()
    ctype.append(ValueTerminal('/', 'content-type-separator'))
    value = value[1:]
    try:
        token, value = get_token(value)
    except errors.HeaderParseError:
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Expected content subtype but found {!r}".format(value)))
        _find_mime_parameters(ctype, value)
        return ctype
    ctype.append(token)
    ctype.subtype = token.value.strip().lower()
    if not value:
        return ctype
    if value[0] != ';':
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Only parameters are valid after content type, but "
            "found {!r}".format(value)))
        # The RFC requires that a syntactically invalid content-type be treated
        # as text/plain.  Perhaps we should postel this, but we should probably
        # only do that if we were checking the subtype value against IANA.
        del ctype.maintype, ctype.subtype
        _find_mime_parameters(ctype, value)
        return ctype
    ctype.append(ValueTerminal(';', 'parameter-separator'))
    ctype.append(parse_mime_parameters(value[1:]))
    return ctype
2677
def parse_content_disposition_header(value):
    """ disposition-type *( ";" parameter )

    Parse a Content-Disposition header value into a ContentDisposition
    token list, recording parse problems as defects instead of raising.
    """
    disp_header = ContentDisposition()
    if not value:
        disp_header.defects.append(errors.HeaderMissingRequiredValue(
            "Missing content disposition"))
        return disp_header
    try:
        token, value = get_token(value)
    except errors.HeaderParseError:
        disp_header.defects.append(errors.InvalidHeaderDefect(
            "Expected content disposition but found {!r}".format(value)))
        _find_mime_parameters(disp_header, value)
        return disp_header
    disp_header.append(token)
    disp_header.content_disposition = token.value.strip().lower()
    if value:
        if value.startswith(';'):
            # Normal case: a parameter list follows the disposition type.
            disp_header.append(ValueTerminal(';', 'parameter-separator'))
            disp_header.append(parse_mime_parameters(value[1:]))
        else:
            disp_header.defects.append(errors.InvalidHeaderDefect(
                "Only parameters are valid after content disposition, but "
                "found {!r}".format(value)))
            _find_mime_parameters(disp_header, value)
    return disp_header
2707
def parse_content_transfer_encoding_header(value):
    """ mechanism

    Parse a Content-Transfer-Encoding header value, recording problems
    as defects on the returned token.
    """
    # We should probably validate the values, since the list is fixed.
    cte_header = ContentTransferEncoding()
    if not value:
        cte_header.defects.append(errors.HeaderMissingRequiredValue(
            "Missing content transfer encoding"))
        return cte_header
    try:
        token, value = get_token(value)
    except errors.HeaderParseError:
        cte_header.defects.append(errors.InvalidHeaderDefect(
            "Expected content transfer encoding but found {!r}".format(value)))
    else:
        cte_header.append(token)
        cte_header.cte = token.value.strip().lower()
    # Anything remaining is invalid; sweep it up piece by piece, recording
    # a defect for each one.
    while value:
        cte_header.defects.append(errors.InvalidHeaderDefect(
            "Extra text after content transfer encoding"))
        if value[0] in PHRASE_ENDS:
            extra = ValueTerminal(value[0], 'misplaced-special')
            value = value[1:]
        else:
            extra, value = get_phrase(value)
        cte_header.append(extra)
    return cte_header
2738
2739
2740#
2741# Header folding
2742#
2743# Header folding is complex, with lots of rules and corner cases.  The
2744# following code does its best to obey the rules and handle the corner
2745# cases, but you can be sure there are few bugs:)
2746#
2747# This folder generally canonicalizes as it goes, preferring the stringified
2748# version of each token.  The tokens contain information that supports the
2749# folder, including which tokens can be encoded in which ways.
2750#
2751# Folded text is accumulated in a simple list of strings ('lines'), each
2752# one of which should be less than policy.max_line_length ('maxlen').
2753#
2754
def _steal_trailing_WSP_if_exists(lines):
    """Remove and return the trailing WSP character of lines[-1], or ''.

    If the last accumulated line ends in folding whitespace, strip that
    single character off and return it so the caller can carry it over
    to the start of a new line.
    """
    last = lines[-1] if lines else ''
    if last and last[-1] in WSP:
        lines[-1] = last[:-1]
        return last[-1]
    return ''
2761
def _refold_parse_tree(parse_tree, *, policy):
    """Return string of contents of parse_tree folded according to RFC rules.

    Tokens are emitted onto accumulating lines, each kept within
    policy.max_line_length where possible, and the lines are joined with
    policy.linesep.  RFC 2047 encoded words are used when a token cannot
    be represented in the target encoding or needs to be wrapped.
    """
    # max_line_length 0/None means no limit, ie: infinitely long.
    maxlen = policy.max_line_length or sys.maxsize
    encoding = 'utf-8' if policy.utf8 else 'us-ascii'
    lines = ['']
    # Start column of the most recent encoded word on the current line (or
    # None); used so adjacent encoded words can be combined into one.
    last_ew = None
    # Depth counter: > 0 while we are inside a token whose subparts must not
    # be wrapped as encoded words.
    wrap_as_ew_blocked = 0
    want_encoding = False
    # Sentinel pushed after the subparts of a token that disallows encoded
    # words; popping it decrements wrap_as_ew_blocked again.
    end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
    parts = list(parse_tree)
    while parts:
        part = parts.pop(0)
        if part is end_ew_not_allowed:
            wrap_as_ew_blocked -= 1
            continue
        tstr = str(part)
        if part.token_type == 'ptext' and set(tstr) & SPECIALS:
            # Encode if tstr contains special characters.
            want_encoding = True
        try:
            tstr.encode(encoding)
            charset = encoding
        except UnicodeEncodeError:
            if any(isinstance(x, errors.UndecodableBytesDefect)
                   for x in part.all_defects):
                # Token carries raw undecodable bytes; label them as such.
                charset = 'unknown-8bit'
            else:
                # If policy.utf8 is false this should really be taken from a
                # 'charset' property on the policy.
                charset = 'utf-8'
            want_encoding = True
        if part.token_type == 'mime-parameters':
            # Mime parameter folding (using RFC2231) is extra special.
            _fold_mime_parameters(part, lines, maxlen, encoding)
            continue
        if want_encoding and not wrap_as_ew_blocked:
            if not part.as_ew_allowed:
                want_encoding = False
                last_ew = None
                if part.syntactic_break:
                    encoded_part = part.fold(policy=policy)[:-len(policy.linesep)]
                    if policy.linesep not in encoded_part:
                        # It fits on a single line
                        if len(encoded_part) > maxlen - len(lines[-1]):
                            # But not on this one, so start a new one.
                            newline = _steal_trailing_WSP_if_exists(lines)
                            # XXX what if encoded_part has no leading FWS?
                            lines.append(newline)
                        lines[-1] += encoded_part
                        continue
                # Either this is not a major syntactic break, so we don't
                # want it on a line by itself even if it fits, or it
                # doesn't fit on a line by itself.  Either way, fall through
                # to unpacking the subparts and wrapping them.
            if not hasattr(part, 'encode'):
                # It's not a Terminal, do each piece individually.
                parts = list(part) + parts
            else:
                # It's a terminal, wrap it as an encoded word, possibly
                # combining it with previously encoded words if allowed.
                last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
                                      part.ew_combine_allowed, charset)
            want_encoding = False
            continue
        if len(tstr) <= maxlen - len(lines[-1]):
            lines[-1] += tstr
            continue
        # This part is too long to fit.  The RFC wants us to break at
        # "major syntactic breaks", so unless we don't consider this
        # to be one, check if it will fit on the next line by itself.
        if (part.syntactic_break and
                len(tstr) + 1 <= maxlen):
            newline = _steal_trailing_WSP_if_exists(lines)
            if newline or part.startswith_fws():
                lines.append(newline + tstr)
                last_ew = None
                continue
        if not hasattr(part, 'encode'):
            # It's not a terminal, try folding the subparts.
            newparts = list(part)
            if not part.as_ew_allowed:
                wrap_as_ew_blocked += 1
                newparts.append(end_ew_not_allowed)
            parts = newparts + parts
            continue
        if part.as_ew_allowed and not wrap_as_ew_blocked:
            # It doesn't need CTE encoding, but encode it anyway so we can
            # wrap it.
            parts.insert(0, part)
            want_encoding = True
            continue
        # We can't figure out how to wrap it, so give up.
        newline = _steal_trailing_WSP_if_exists(lines)
        if newline or part.startswith_fws():
            lines.append(newline + tstr)
        else:
            # We can't fold it onto the next line either...
            lines[-1] += tstr
    return policy.linesep.join(lines) + policy.linesep
2864
def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
    """Fold string to_encode into lines as encoded word, combining if allowed.
    Return the new value for last_ew, or None if ew_combine_allowed is False.

    If there is already an encoded word in the last line of lines (indicated by
    a non-None value for last_ew) and ew_combine_allowed is true, decode the
    existing ew, combine it with to_encode, and re-encode.  Otherwise, encode
    to_encode.  In either case, split to_encode as necessary so that the
    encoded segments fit within maxlen.

    """
    if last_ew is not None and ew_combine_allowed:
        # Strip the existing encoded word off the last line, decode it, and
        # prepend the decoded text so everything is re-encoded as one unit.
        to_encode = str(
            get_unstructured(lines[-1][last_ew:] + to_encode))
        lines[-1] = lines[-1][:last_ew]
    # Guard against an empty to_encode: indexing [0]/[-1] would raise
    # IndexError (e.g. a lone WSP reduces to '' after the strip below).
    if to_encode and to_encode[0] in WSP:
        # We're joining this to non-encoded text, so don't encode
        # the leading blank.
        leading_wsp = to_encode[0]
        to_encode = to_encode[1:]
        if (len(lines[-1]) == maxlen):
            lines.append(_steal_trailing_WSP_if_exists(lines))
        lines[-1] += leading_wsp
    trailing_wsp = ''
    if to_encode and to_encode[-1] in WSP:
        # Likewise for the trailing space.
        trailing_wsp = to_encode[-1]
        to_encode = to_encode[:-1]
    new_last_ew = len(lines[-1]) if last_ew is None else last_ew

    encode_as = 'utf-8' if charset == 'us-ascii' else charset

    # The RFC2047 chrome takes up 7 characters plus the length
    # of the charset name.
    chrome_len = len(encode_as) + 7

    if (chrome_len + 1) >= maxlen:
        raise errors.HeaderParseError(
            "max_line_length is too small to fit an encoded word")

    while to_encode:
        remaining_space = maxlen - len(lines[-1])
        text_space = remaining_space - chrome_len
        if text_space <= 0:
            # No room for any encoded text on this line; start a new one.
            lines.append(' ')
            continue

        to_encode_word = to_encode[:text_space]
        encoded_word = _ew.encode(to_encode_word, charset=encode_as)
        excess = len(encoded_word) - remaining_space
        while excess > 0:
            # Since the chunk to encode is guaranteed to fit into less than 100 characters,
            # shrinking it by one at a time shouldn't take long.
            to_encode_word = to_encode_word[:-1]
            encoded_word = _ew.encode(to_encode_word, charset=encode_as)
            excess = len(encoded_word) - remaining_space
        lines[-1] += encoded_word
        to_encode = to_encode[len(to_encode_word):]

        if to_encode:
            lines.append(' ')
            new_last_ew = len(lines[-1])
    lines[-1] += trailing_wsp
    return new_last_ew if ew_combine_allowed else None
2929
def _fold_mime_parameters(part, lines, maxlen, encoding):
    """Fold TokenList 'part' into the 'lines' list as mime parameters.

    Using the decoded list of parameters and values, format them according to
    the RFC rules, including using RFC2231 encoding if the value cannot be
    expressed in 'encoding' and/or the parameter+value is too long to fit
    within 'maxlen'.

    """
    # Special case for RFC2231 encoding: start from decoded values and use
    # RFC2231 encoding iff needed.
    #
    # Note that the 1 and 2s being added to the length calculations are
    # accounting for the possibly-needed spaces and semicolons we'll be adding.
    #
    for name, value in part.params:
        # XXX What if this ';' puts us over maxlen the first time through the
        # loop?  We should split the header value onto a newline in that case,
        # but to do that we need to recognize the need earlier or reparse the
        # header, so I'm going to ignore that bug for now.  It'll only put us
        # one character over.
        if not lines[-1].rstrip().endswith(';'):
            lines[-1] += ';'
        charset = encoding
        error_handler = 'strict'
        try:
            value.encode(encoding)
            encoding_required = False
        except UnicodeEncodeError:
            encoding_required = True
            if utils._has_surrogates(value):
                # Undecodable bytes: keep them intact via surrogateescape
                # and label the charset as unknown.
                charset = 'unknown-8bit'
                error_handler = 'surrogateescape'
            else:
                charset = 'utf-8'
        if encoding_required:
            # RFC2231 extended form: name*=charset''percent-encoded-value.
            encoded_value = urllib.parse.quote(
                value, safe='', errors=error_handler)
            tstr = "{}*={}''{}".format(name, charset, encoded_value)
        else:
            tstr = '{}={}'.format(name, quote_string(value))
        if len(lines[-1]) + len(tstr) + 1 < maxlen:
            # Fits on the current line (with the leading space).
            lines[-1] = lines[-1] + ' ' + tstr
            continue
        elif len(tstr) + 2 <= maxlen:
            # Fits on a continuation line by itself.
            lines.append(' ' + tstr)
            continue
        # We need multiple sections.  We are allowed to mix encoded and
        # non-encoded sections, but we aren't going to.  We'll encode them all.
        section = 0
        extra_chrome = charset + "''"
        while value:
            chrome_len = len(name) + len(str(section)) + 3 + len(extra_chrome)
            if maxlen <= chrome_len + 3:
                # We need room for the leading blank, the trailing semicolon,
                # and at least one character of the value.  If we don't
                # have that, we'd be stuck, so in that case fall back to
                # the RFC standard width.
                maxlen = 78
            splitpoint = maxchars = maxlen - chrome_len - 2
            while True:
                partial = value[:splitpoint]
                encoded_value = urllib.parse.quote(
                    partial, safe='', errors=error_handler)
                if len(encoded_value) <= maxchars:
                    break
                # Percent-encoding expanded the slice past the budget;
                # retry with one fewer source character.
                splitpoint -= 1
            lines.append(" {}*{}*={}{}".format(
                name, section, extra_chrome, encoded_value))
            extra_chrome = ''
            section += 1
            value = value[splitpoint:]
            if value:
                lines[-1] += ';'
3004