# Source code for cli.lexer

"""A module with Lexer responsibility.

Lexing is a common step in language
compilation or interpretation. A lexem
is a group of characters of the input
stream, grouped by some "meaning".
"""
import enum
import logging

from cli.exceptions import LexException


@enum.unique
class LexemType(enum.Enum):
    """Possible types of lexems that the Lexer produces.

    Members:

    - ``QUOTED_STRING`` is a sequence of characters inside double
      or single quotes;
    - ``STRING`` is a non-space sequence of characters;
    - ``ASSIGNMENT`` is a string of the form "smth=smth_other"
      (without quotes), where `smth_other` can be empty;
    - ``PIPE`` is a `|` symbol.

    ``enum.unique`` guarantees that no two members share a value.
    """

    QUOTED_STRING = 1
    STRING = 2
    ASSIGNMENT = 3
    PIPE = 4


class Lexem:
    """A single lexem.

    A Lexem provides an interface for querying its position in the
    string (for producing meaningful error messages), its type, and
    for getting its string representation.
    """

    def __init__(self, tp, val, start_idx, end_idx):
        """Create a lexem out of type, string, start and end indices.

        Args:
            tp (:class:`.LexemType`): a type of lexem;
            val (str): a string representation of this lexem;
            start_idx (int): starting index, zero-based;
            end_idx (int): ending index, zero-based.
        """
        self._tp = tp
        self._val = val
        self._start_idx = start_idx
        self._end_idx = end_idx

    def __repr__(self):
        """Return a debugging representation listing all four fields."""
        return 'Lexem({!r}, {!r}, {}, {})'.format(self._tp, self._val,
                                                  self._start_idx, self._end_idx)

    def get_value(self):
        """Return string representation of this lexem.

        For a ``QUOTED_STRING`` lexem the first and the last characters
        of the stored value (the surrounding quotes) are dropped; every
        other lexem type is returned unchanged.
        """
        if self.get_type() == LexemType.QUOTED_STRING:
            return self._val[1:-1]

        return self._val

    def get_type(self):
        """Return the type of this lexem as passed to the constructor."""
        return self._tp

    def get_position(self):
        """Return string representation of the position.

        For example, if ``self._start_idx = 1`` and ``self._end_idx = 5``,
        then this function will return ``(1:5)``.
        """
        return '({}:{})'.format(self._start_idx, self._end_idx)


class Lexer:
    """A static class for lexing a preprocessed string.

    Given a string, we want to split it into meaningful
    (more or less) tokens.
    Possible tokens are described in :class:`.LexemType` docstring.
    """

    @staticmethod
    def get_lexemes(raw_str):
        """Scan the string left-to-right, output list of lexemes.

        Whitespace between lexemes is skipped. Positions stored in the
        produced lexemes are indices into ``raw_str`` itself, including
        any leading whitespace.

        Args:
            raw_str (str): a string to lex.

        Returns:
            list[:class:`.Lexem`] -- the resulting lexemes (empty if the
            string is empty or contains only whitespace).

        Raises:
            :class:`exceptions.LexException`: if some quoted string started but never ends.
        """
        lexem_list = []
        unprocessed_part = raw_str.lstrip()
        orig_size = len(raw_str)

        while unprocessed_part:
            # Characters are only ever consumed from the left, so the
            # remaining length pins down the offset in the original string.
            start_index = orig_size - len(unprocessed_part)
            next_lexem, unprocessed_part = Lexer._get_first_lexem(unprocessed_part,
                                                                  start_index)
            # Re-establish the "no leading whitespace" invariant for the next call.
            unprocessed_part = unprocessed_part.lstrip()
            lexem_list.append(next_lexem)

        # Lazy %-style arguments: the message is only built if DEBUG is enabled.
        logging.debug('Lexer: %s was lexed to %s', raw_str,
                      ','.join(lex.get_type().name for lex in lexem_list))
        return lexem_list

    @staticmethod
    def _get_first_lexem(raw_str, start_idx_in_original):
        """Get the first lexem of the string.

        Args:
            raw_str (str): the string to be analyzed;
            start_idx_in_original (int): an index of this
                string in an original string
                (of which this string is a part).

        Returns:
            tuple(:class:`.Lexem`, str) -- the first lexem and the part of
            ``raw_str`` that follows it (not stripped).

        Raises:
            :class:`exceptions.LexException`: if the string starts with a quote
            that is never closed.

        Invariant: string is not empty, it does not start with whitespace.
        """
        first_char = raw_str[0]

        if first_char == '|':
            # A pipe is always a one-character lexem.
            return (Lexem(LexemType.PIPE, '|', start_idx_in_original,
                          start_idx_in_original),
                    raw_str[1:])

        if first_char in ('"', "'"):
            # The string ends at the next occurrence of the SAME quote
            # character; the other kind of quote is ordinary content.
            closing_quote_idx = raw_str.find(first_char, 1)

            if closing_quote_idx == -1:
                raise LexException('A non-terminating quoted string starting '
                                   'at position {}'.format(start_idx_in_original))

            # The stored value keeps both quotes.
            return (Lexem(LexemType.QUOTED_STRING, raw_str[:closing_quote_idx + 1],
                          start_idx_in_original,
                          start_idx_in_original + closing_quote_idx),
                    raw_str[closing_quote_idx + 1:])

        # A bare word runs until the next whitespace or quote character.
        # NOTE: a '|' inside the word does not terminate it.
        word_end = len(raw_str)
        for idx in range(1, len(raw_str)):
            cur_char = raw_str[idx]
            if cur_char.isspace() or cur_char in ('"', "'"):
                word_end = idx
                break

        lexem_val = raw_str[:word_end]
        rest_string = raw_str[word_end:]

        # Any bare word containing '=' is classified as an assignment.
        lexem_type = LexemType.ASSIGNMENT if '=' in lexem_val else LexemType.STRING

        return (Lexem(lexem_type, lexem_val,
                      start_idx_in_original,
                      start_idx_in_original + word_end - 1),
                rest_string)