1#
2# -*- coding: utf-8 -*-
3"""Statement parsing classes for cmd2"""
4
5import re
6import shlex
7from typing import (
8    Any,
9    Dict,
10    Iterable,
11    List,
12    Optional,
13    Tuple,
14    Union,
15)
16
17import attr
18
19from . import (
20    constants,
21    utils,
22)
23from .exceptions import (
24    Cmd2ShlexError,
25)
26
27
def shlex_split(str_to_split: str) -> List[str]:
    """
    Split a string into tokens the same way StatementParser does.

    This is a thin wrapper around shlex.split() that pins cmd2's preferred
    arguments (comment handling disabled, non-POSIX mode) so that other
    classes can tokenize text consistently with the parser.

    :param str_to_split: the string being split
    :return: A list of tokens
    """
    tokens = shlex.split(str_to_split, posix=False, comments=False)
    return tokens
37
38
@attr.s(auto_attribs=True, frozen=True)
class MacroArg:
    """
    Describes one argument placeholder found in a macro value.

    The information stored here is used to replace or unescape arguments in a
    macro value when the macro is resolved.

    Normal argument syntax:    {5}
    Escaped argument syntax:  {{5}}
    """

    # The starting index of this argument in the macro value
    start_index: int = attr.ib(validator=attr.validators.instance_of(int))

    # The number string that appears between the braces.
    # This is kept as a string instead of an int because unicode digits are supported
    # and the exact original text must be reproducible later.
    number_str: str = attr.ib(validator=attr.validators.instance_of(str))

    # Tells if this argument is escaped and therefore needs to be unescaped
    is_escaped: bool = attr.ib(validator=attr.validators.instance_of(bool))

    # NOTE(review): the three patterns below carry no type annotation, so with
    # auto_attribs=True they are presumably plain class attributes rather than
    # attrs fields - confirm against the attrs documentation.

    # Pattern used to find normal arguments.
    # Digits surrounded by exactly 1 brace on a side and 1 or more braces on the opposite side.
    # The first alternative uses a lookbehind to require that no '{' precedes the opening brace;
    # the second uses a lookahead to require that no '}' follows the closing brace.
    # Match strings like: {5}, {{{{{4}, {2}}}}}
    macro_normal_arg_pattern = re.compile(r'(?<!{){\d+}|{\d+}(?!})')

    # Pattern used to find escaped arguments.
    # Digits surrounded by 2 or more braces on both sides.
    # In the regex, '{{2}' is a literal '{' repeated twice and '}{2}' is a literal '}' repeated twice.
    # Match strings like: {{5}}, {{{{{4}}, {{2}}}}}
    macro_escaped_arg_pattern = re.compile(r'{{2}\d+}{2}')

    # Finds a string of digits
    digit_pattern = re.compile(r'\d+')
70
71
@attr.s(auto_attribs=True, frozen=True)
class Macro:
    """Defines a cmd2 macro: a named string that may contain argument placeholders"""

    # Name of the macro
    name: str = attr.ib(validator=attr.validators.instance_of(str))

    # The string the macro resolves to
    value: str = attr.ib(validator=attr.validators.instance_of(str))

    # The minimum number of args the user has to pass to this macro
    minimum_arg_count: int = attr.ib(validator=attr.validators.instance_of(int))

    # Placeholder descriptions (MacroArg instances) used to fill in the
    # argument placeholders in the macro value. Defaults to a new empty list.
    arg_list: List[MacroArg] = attr.ib(default=attr.Factory(list), validator=attr.validators.instance_of(list))
87
88
@attr.s(auto_attribs=True, frozen=True)
class Statement(str):  # type: ignore[override]
    """String subclass with additional attributes to store the results of parsing.

    The ``cmd`` module in the standard library passes commands around as a
    string. To retain backwards compatibility, ``cmd2`` does the same. However,
    we need a place to capture the additional output of the command parsing, so
    we add our own attributes to this subclass.

    Instances of this class should not be created by anything other than the
    :meth:`cmd2.parsing.StatementParser.parse` method, nor should any of the
    attributes be modified once the object is created.

    The string portion of the class contains the arguments, but not the
    command, nor the output redirection clauses.

    Tips:

    1. `argparse <https://docs.python.org/3/library/argparse.html>`_ is your
       friend for anything complex. ``cmd2`` has the decorator
       (:func:`~cmd2.decorators.with_argparser`) which you can
       use to make your command method receive a namespace of parsed arguments,
       whether positional or denoted with switches.

    2. For commands with simple positional arguments, use
       :attr:`~cmd2.Statement.args` or :attr:`~cmd2.Statement.arg_list`

    3. If you don't want to have to worry about quoted arguments, see
       :attr:`argv` for a trick which strips quotes off for you.
    """

    # the arguments, but not the command, nor the output redirection clauses.
    args: str = attr.ib(default='', validator=attr.validators.instance_of(str))

    # string containing exactly what was input by the user
    raw: str = attr.ib(default='', validator=attr.validators.instance_of(str))

    # the command, i.e. the first whitespace delimited word
    command: str = attr.ib(default='', validator=attr.validators.instance_of(str))

    # list of arguments to the command, not including any output redirection or terminators; quoted args remain quoted
    arg_list: List[str] = attr.ib(default=attr.Factory(list), validator=attr.validators.instance_of(list))

    # if the command is a multiline command, the name of the command, otherwise empty
    multiline_command: str = attr.ib(default='', validator=attr.validators.instance_of(str))

    # the character which terminated the multiline command, if there was one
    terminator: str = attr.ib(default='', validator=attr.validators.instance_of(str))

    # characters appearing after the terminator but before output redirection, if any
    suffix: str = attr.ib(default='', validator=attr.validators.instance_of(str))

    # if output was piped to a shell command, the shell command as a string
    pipe_to: str = attr.ib(default='', validator=attr.validators.instance_of(str))

    # if output was redirected, the redirection token, i.e. '>>'
    output: str = attr.ib(default='', validator=attr.validators.instance_of(str))

    # if output was redirected, the destination file token (quotes preserved)
    output_to: str = attr.ib(default='', validator=attr.validators.instance_of(str))

    # Key under which the string value (the args field) is stored in JSON dictionaries
    _args_field = 'args'

    def __new__(cls, value: object, *pos_args: Any, **kw_args: Any) -> 'Statement':
        """Create a new instance of Statement.

        We must override __new__ because we are subclassing `str` which is
        immutable and takes a different number of arguments than Statement.
        Only ``value`` is handed to ``str``; the remaining arguments are ignored here.

        NOTE:  attrs takes care of initializing other members in the __init__ it
        generates.
        """
        return super().__new__(cls, value)

    @property
    def command_and_args(self) -> str:
        """Combine command and args with a space separating them.

        Quoted arguments remain quoted. Output redirection and piping are
        excluded, as are any command terminators.

        :return: ``'<command> <args>'``, just the command when there are no
                 args, or an empty string when there is no command
        """
        if not self.command:
            return ''
        if self.args:
            return f'{self.command} {self.args}'
        # there were no arguments to the command
        return self.command

    @property
    def post_command(self) -> str:
        """A string containing any ending terminator, suffix, and redirection chars"""
        pieces: List[str] = []
        if self.terminator:
            pieces.append(self.terminator)

        if self.suffix:
            pieces.append(' ' + self.suffix)

        if self.pipe_to:
            pieces.append(' | ' + self.pipe_to)

        if self.output:
            pieces.append(' ' + self.output)
            # a destination is only emitted together with its redirection token
            if self.output_to:
                pieces.append(' ' + self.output_to)

        return ''.join(pieces)

    @property
    def expanded_command_line(self) -> str:
        """Concatenate :meth:`~cmd2.Statement.command_and_args`
        and :meth:`~cmd2.Statement.post_command`"""
        return self.command_and_args + self.post_command

    @property
    def argv(self) -> List[str]:
        """a list of arguments a-la ``sys.argv``.

        The first element of the list is the command after shortcut and macro
        expansion. Subsequent elements of the list contain any additional
        arguments, with quotes removed, just like bash would. This is very
        useful if you are going to use ``argparse.parse_args()``.

        If you want to strip quotes from the input, you can use ``argv[1:]``.

        :return: the quote-stripped command followed by the quote-stripped
                 arguments, or an empty list when there is no command
        """
        if not self.command:
            return []
        return [utils.strip_quotes(token) for token in (self.command, *self.arg_list)]

    def to_dict(self) -> Dict[str, Any]:
        """Utility method to convert this Statement into a dictionary for use in persistent JSON history files

        :return: a shallow copy of this instance's attribute dictionary
        """
        return self.__dict__.copy()

    @staticmethod
    def from_dict(source_dict: Dict[str, Any]) -> 'Statement':
        """
        Utility method to restore a Statement from a dictionary

        :param source_dict: source data dictionary (generated using to_dict())
        :return: Statement object
        :raises KeyError: if source_dict is missing required elements
        """
        # value needs to be passed as a positional argument. It corresponds to the args field.
        try:
            value = source_dict[Statement._args_field]
        except KeyError as ex:
            # chain the original lookup failure so the traceback shows its cause explicitly
            raise KeyError(f"Statement dictionary is missing {ex} field") from ex

        # Pass the rest as kwargs (minus args); source_dict itself is left untouched
        kwargs = {key: val for key, val in source_dict.items() if key != Statement._args_field}

        return Statement(value, **kwargs)
251
252
class StatementParser:
    """Parse user input as a string into discrete command components."""

    def __init__(
        self,
        terminators: Optional[Iterable[str]] = None,
        multiline_commands: Optional[Iterable[str]] = None,
        aliases: Optional[Dict[str, str]] = None,
        shortcuts: Optional[Dict[str, str]] = None,
    ) -> None:
        """Initialize an instance of StatementParser.

        The following will get converted to an immutable tuple before storing internally:
        terminators, multiline commands, and shortcuts.

        The aliases dictionary is stored as passed (it is not copied).

        :param terminators: iterable containing strings which should terminate commands.
                            Defaults to ``constants.MULTILINE_TERMINATOR`` when None.
        :param multiline_commands: iterable containing the names of commands that accept multiline input
        :param aliases: dictionary containing aliases
        :param shortcuts: dictionary containing shortcuts. Defaults to
                          ``constants.DEFAULT_SHORTCUTS`` when None.
        """
        self.terminators: Tuple[str, ...]
        if terminators is None:
            self.terminators = (constants.MULTILINE_TERMINATOR,)
        else:
            self.terminators = tuple(terminators)
        self.multiline_commands: Tuple[str, ...] = tuple(multiline_commands) if multiline_commands is not None else ()
        self.aliases: Dict[str, str] = aliases if aliases is not None else {}

        if shortcuts is None:
            shortcuts = constants.DEFAULT_SHORTCUTS

        # Sort the shortcuts in descending order by name length because the longest match
        # should take precedence. (e.g., @@file should match '@@' and not '@'.)
        self.shortcuts = tuple(sorted(shortcuts.items(), key=lambda x: len(x[0]), reverse=True))

        # commands have to be a word, so make a regular expression
        # that matches the first word in the line. This regex has three
        # parts:
        #     - the '\A\s*' matches the beginning of the string (even
        #       if contains multiple lines) and gobbles up any leading
        #       whitespace
        #     - the first parenthesis enclosed group matches zero
        #       or more non-whitespace characters with a non-greedy match
        #       (that's what the '*?' part does). The non-greedy match
        #       ensures that this first group doesn't include anything
        #       matched by the second group
        #     - the second parenthesis group must be dynamically created
        #       because it needs to match either whitespace, a quote,
        #       something in REDIRECTION_CHARS, one of the terminators,
        #       or the end of the string (\Z matches the end of the
        #       string even if it contains multiple lines)
        #
        invalid_command_chars: List[str] = [
            *constants.QUOTES,
            *constants.REDIRECTION_CHARS,
            *self.terminators,
        ]
        # escape each item so it will for sure get treated as a literal
        second_group_items = [re.escape(x) for x in invalid_command_chars]
        # add the whitespace and end of string, not escaped because they
        # are not literals
        second_group_items.extend([r'\s', r'\Z'])
        # join them up with a pipe
        second_group = '|'.join(second_group_items)
        # build the regular expression
        expr = rf'\A\s*(\S*?)({second_group})'
        self._command_pattern = re.compile(expr)

    def is_valid_command(self, word: str, *, is_subcommand: bool = False) -> Tuple[bool, str]:
        """Determine whether a word is a valid name for a command.

        Commands cannot include redirection characters, whitespace,
        quotes, or termination characters. They also cannot start with
        the comment character or (unless they are subcommands) a shortcut.

        :param word: the word to check as a command
        :param is_subcommand: Flag whether this command name is a subcommand name.
                              Subcommand names skip the shortcut check.
        :return: a tuple of a boolean and an error string

        If word is not a valid command, return ``False`` and an error string
        suitable for inclusion in an error message of your choice::

            checkit = '>'
            valid, errmsg = statement_parser.is_valid_command(checkit)
            if not valid:
                errmsg = f"alias: {errmsg}"
        """
        if not isinstance(word, str):
            return False, f'must be a string. Received {str(type(word))} instead'  # type: ignore[unreachable]

        if not word:
            return False, 'cannot be an empty string'

        if word.startswith(constants.COMMENT_CHAR):
            return False, 'cannot start with the comment character'

        if not is_subcommand:
            shortcut_names = [name for (name, _) in self.shortcuts]
            # str.startswith accepts a tuple and is True if any entry is a prefix
            if word.startswith(tuple(shortcut_names)):
                # Build an error string with all shortcuts listed
                return False, 'cannot start with a shortcut: ' + ', '.join(shortcut_names)

        # The word is valid only if the command pattern consumes all of it as the command group
        match = self._command_pattern.search(word)
        if match and word == match.group(1):
            return True, ''

        errchars = [*constants.REDIRECTION_CHARS, *self.terminators]
        errmsg = 'cannot contain: whitespace, quotes, ' + ', '.join([shlex.quote(x) for x in errchars])
        return False, errmsg

    def tokenize(self, line: str) -> List[str]:
        """
        Lex a string into a list of tokens. Shortcuts and aliases are expanded and
        comments are removed.

        :param line: the command line being lexed
        :return: A list of tokens (empty if the expanded line is a comment)
        :raises: Cmd2ShlexError if a shlex error occurs (e.g. No closing quotation)
        """

        # expand shortcuts and aliases
        line = self._expand(line)

        # check if this line is a comment
        if line.lstrip().startswith(constants.COMMENT_CHAR):
            return []

        # split on whitespace
        try:
            tokens = shlex_split(line)
        except ValueError as ex:
            # chain the shlex failure so its traceback is preserved as the explicit cause
            raise Cmd2ShlexError(ex) from ex

        # custom lexing
        return self.split_on_punctuation(tokens)

    def parse(self, line: str) -> Statement:
        """
        Tokenize the input and parse it into a :class:`~cmd2.Statement` object,
        stripping comments, expanding aliases and shortcuts, and extracting output
        redirection directives.

        :param line: the command line being parsed
        :return: a new :class:`~cmd2.Statement` object
        :raises: Cmd2ShlexError if a shlex error occurs (e.g. No closing quotation)
        """

        def _index_or_end(token_list: List[str], wanted: str) -> int:
            """Return the index of the first ``wanted`` token, or len(token_list) if absent."""
            try:
                return token_list.index(wanted)
            except ValueError:
                return len(token_list)

        # handle the special case/hardcoded terminator of a blank line
        # we have to do this before we tokenize because tokenizing
        # destroys all unquoted whitespace in the input
        terminator = ''
        if line[-1:] == constants.LINE_FEED:
            terminator = constants.LINE_FEED

        command = ''
        args = ''
        arg_list: List[str] = []

        # lex the input into a list of tokens
        tokens = self.tokenize(line)

        # of the valid terminators, find the first one to occur in the input
        terminator_pos = len(tokens) + 1
        for pos, cur_token in enumerate(tokens):
            matched = next((cand for cand in self.terminators if cur_token.startswith(cand)), None)
            if matched is not None:
                terminator_pos = pos
                terminator = matched
                break

        if terminator:
            if terminator == constants.LINE_FEED:
                # the line feed never appears as a token, so nothing gets cut off
                terminator_pos = len(tokens) + 1

            # everything before the first terminator is the command and the args
            (command, args) = self._command_and_args(tokens[:terminator_pos])
            arg_list = tokens[1:terminator_pos]
            # we will set the suffix later
            # remove all the tokens before and including the terminator
            tokens = tokens[terminator_pos + 1 :]
        else:
            (testcommand, testargs) = self._command_and_args(tokens)
            if testcommand in self.multiline_commands:
                # no terminator on this line but we have a multiline command
                # everything else on the line is part of the args
                # because redirectors can only be after a terminator
                command = testcommand
                args = testargs
                arg_list = tokens[1:]
                tokens = []

        pipe_to = ''
        output = ''
        output_to = ''

        # Find which redirector character appears first in the command
        pipe_index = _index_or_end(tokens, constants.REDIRECTION_PIPE)
        redir_index = _index_or_end(tokens, constants.REDIRECTION_OUTPUT)
        append_index = _index_or_end(tokens, constants.REDIRECTION_APPEND)

        # Check if output should be piped to a shell command
        if pipe_index < redir_index and pipe_index < append_index:

            # Get the tokens for the pipe command and expand ~ where needed
            pipe_to_tokens = tokens[pipe_index + 1 :]
            utils.expand_user_in_tokens(pipe_to_tokens)

            # Build the pipe command line string
            pipe_to = ' '.join(pipe_to_tokens)

            # remove all the tokens after the pipe
            tokens = tokens[:pipe_index]

        # Check for output redirect/append
        # (the two indexes are only equal when neither token is present)
        elif redir_index != append_index:
            if redir_index < append_index:
                output = constants.REDIRECTION_OUTPUT
                output_index = redir_index
            else:
                output = constants.REDIRECTION_APPEND
                output_index = append_index

            # Check if we are redirecting to a file
            if len(tokens) > output_index + 1:
                unquoted_path = utils.strip_quotes(tokens[output_index + 1])
                if unquoted_path:
                    output_to = utils.expand_user(tokens[output_index + 1])

            # remove all the tokens after the output redirect
            tokens = tokens[:output_index]

        if terminator:
            # whatever is left is the suffix
            suffix = ' '.join(tokens)
        else:
            # no terminator, so whatever is left is the command and the args
            suffix = ''
            if not command:
                # command could already have been set, if so, don't set it again
                (command, args) = self._command_and_args(tokens)
                arg_list = tokens[1:]

        # set multiline
        multiline_command = command if command in self.multiline_commands else ''

        # build the statement
        return Statement(
            args,
            raw=line,
            command=command,
            arg_list=arg_list,
            multiline_command=multiline_command,
            terminator=terminator,
            suffix=suffix,
            pipe_to=pipe_to,
            output=output,
            output_to=output_to,
        )

    def parse_command_only(self, rawinput: str) -> Statement:
        """Partially parse input into a :class:`~cmd2.Statement` object.

        The command is identified, and shortcuts and aliases are expanded.
        Multiline commands are identified, but terminators and output
        redirection are not parsed.

        This method is used by tab completion code and therefore must not
        generate an exception if there are unclosed quotes.

        The :class:`~cmd2.Statement` object returned by this method can at most
        contain values in the following attributes:
        :attr:`~cmd2.Statement.args`, :attr:`~cmd2.Statement.raw`,
        :attr:`~cmd2.Statement.command`,
        :attr:`~cmd2.Statement.multiline_command`

        :attr:`~cmd2.Statement.args` will include all output redirection
        clauses and command terminators.

        Different from :meth:`~cmd2.parsing.StatementParser.parse` this method
        does not remove redundant whitespace within args. However, it does
        ensure args has no leading or trailing whitespace.

        :param rawinput: the command line as entered by the user
        :return: a new :class:`~cmd2.Statement` object
        """
        # expand shortcuts and aliases
        line = self._expand(rawinput)

        command = ''
        args = ''
        match = self._command_pattern.search(line)
        if match:
            # we got a match, extract the command
            command = match.group(1)

            # take everything from the end of the first match group to
            # the end of the line as the arguments (stripping leading
            # and trailing spaces)
            args = line[match.end(1) :].strip()
            # if the command is empty that means the input was either empty
            # or something weird like '>'. args should be empty if we couldn't
            # parse a command
            if not command:
                args = ''

        # set multiline
        multiline_command = command if command in self.multiline_commands else ''

        # build the statement
        return Statement(args, raw=rawinput, command=command, multiline_command=multiline_command)

    def get_command_arg_list(
        self, command_name: str, to_parse: Union[Statement, str], preserve_quotes: bool
    ) -> Tuple[Statement, List[str]]:
        """
        Convenience method used by the argument parsing decorators.

        Retrieves just the arguments being passed to their ``do_*`` methods as a list.

        :param command_name: name of the command being run
        :param to_parse: what is being passed to the ``do_*`` method. It can be one of two types:

                             1. An already parsed :class:`~cmd2.Statement`
                             2. An argument string in cases where a ``do_*`` method is
                                explicitly called. Calling ``do_help('alias create')`` would
                                cause ``to_parse`` to be 'alias create'.

                                In this case, the string will be converted to a
                                :class:`~cmd2.Statement` and returned along with
                                the argument list.

        :param preserve_quotes: if ``True``, then quotes will not be stripped from
                                the arguments
        :return: A tuple containing the :class:`~cmd2.Statement` and a list of
                 strings representing the arguments
        """
        # Check if to_parse needs to be converted to a Statement
        if not isinstance(to_parse, Statement):
            to_parse = self.parse(command_name + ' ' + to_parse)

        if preserve_quotes:
            return to_parse, to_parse.arg_list
        # argv[0] is the command itself, so skip it
        return to_parse, to_parse.argv[1:]

    def _expand(self, line: str) -> str:
        """Expand aliases and shortcuts

        Aliases are resolved repeatedly on the first word of the line, each
        alias at most once. Afterwards at most one shortcut (the first one
        in ``self.shortcuts`` that prefixes the line) is expanded.

        :param line: the command line to expand
        :return: the expanded command line
        """
        # Track which aliases have not been resolved yet so that each one is used
        # at most once, which avoids an infinite loop on self-referencing aliases
        remaining_aliases = set(self.aliases)

        while remaining_aliases:
            # apply our regex to line
            match = self._command_pattern.search(line)
            if not match:
                break

            # we got a match, extract the command
            command = match.group(1)

            # Stop unless this command matches an alias that wasn't already processed
            if command not in remaining_aliases:
                break

            # rebuild line with the expanded alias
            line = self.aliases[command] + match.group(2) + line[match.end(2) :]
            remaining_aliases.discard(command)

        # expand shortcuts
        for (shortcut, expansion) in self.shortcuts:
            if not line.startswith(shortcut):
                continue

            # If the next character after the shortcut isn't a space, then insert one
            remainder = line[len(shortcut) :]
            separator = '' if remainder.startswith(' ') else ' '

            # Expand the shortcut (it is known to sit at the very start of the line)
            line = expansion + separator + remainder
            break
        return line

    @staticmethod
    def _command_and_args(tokens: List[str]) -> Tuple[str, str]:
        """Given a list of tokens, return a tuple of the command
        and the args as a string.

        :param tokens: tokens making up the command and its arguments
        :return: the first token and the remaining tokens joined by single
                 spaces; both are empty strings when there are no tokens
        """
        if not tokens:
            return '', ''

        return tokens[0], ' '.join(tokens[1:])

    def split_on_punctuation(self, tokens: List[str]) -> List[str]:
        """Further splits tokens from a command line using punctuation characters.

        Punctuation characters are treated as word breaks when they are in
        unquoted strings. Each run of the same punctuation character is
        treated as a single token.

        :param tokens: the tokens as parsed by shlex
        :return: a new list of tokens, further split using punctuation
        """
        punctuation: List[str] = [*self.terminators, *constants.REDIRECTION_CHARS]

        punctuated_tokens: List[str] = []

        for cur_initial_token in tokens:

            # Save tokens up to 1 character in length or quoted tokens. No need to parse these.
            if len(cur_initial_token) <= 1 or cur_initial_token[0] in constants.QUOTES:
                punctuated_tokens.append(cur_initial_token)
                continue

            token_len = len(cur_initial_token)
            start = 0

            # Carve the token into consecutive runs, one new token per run
            while start < token_len:
                first_char = cur_initial_token[start]
                end = start + 1

                if first_char in punctuation:
                    # Extend the run while we keep seeing this same punctuation char
                    while end < token_len and cur_initial_token[end] == first_char:
                        end += 1
                else:
                    # Extend the run until we hit a punctuation char
                    while end < token_len and cur_initial_token[end] not in punctuation:
                        end += 1

                # Save the new token
                punctuated_tokens.append(cur_initial_token[start:end])
                start = end

        return punctuated_tokens
750