Source code for cli.lexer

"""A module with Lexer responsibility.

Lexing is a common step in language
compilation or interpretation. A lexem
is a group of characters of the input
stream, grouped by some "meaning".
"""
import enum
import logging

from cli.exceptions import LexException


@enum.unique
class LexemType(enum.Enum):
    """Kinds of lexemes that the Lexer can emit.

    Members:

    - ``QUOTED_STRING`` -- characters enclosed in a pair of single or
      double quotes;
    - ``STRING`` -- a run of characters that contains no whitespace;
    - ``ASSIGNMENT`` -- an unquoted token of the form ``name=value``,
      where ``value`` may be empty;
    - ``PIPE`` -- the ``|`` symbol.
    """

    QUOTED_STRING = 1
    STRING = 2
    ASSIGNMENT = 3
    PIPE = 4
class Lexem:
    """One lexem produced by the lexer.

    Exposes the lexem's type, its string value and a printable
    description of where it sits in the lexed string (useful for
    error messages).
    """

    def __init__(self, tp, val, start_idx, end_idx):
        """Store the type, raw text and index range of a lexem.

        Args:
            tp (:class:`.LexemType`): the kind of lexem;
            val (str): the raw text of the lexem (quotes included for
                quoted strings);
            start_idx (int): zero-based index where the lexem starts;
            end_idx (int): zero-based index where the lexem ends.
        """
        self._tp, self._val = tp, val
        self._start_idx, self._end_idx = start_idx, end_idx

    def get_value(self):
        """Return the text of this lexem.

        For a ``QUOTED_STRING`` the first and last characters (the
        quotes) are dropped; every other type is returned unchanged.
        """
        is_quoted = self.get_type() == LexemType.QUOTED_STRING
        return self._val[1:-1] if is_quoted else self._val

    def get_type(self):
        """Return the type this lexem was created with."""
        return self._tp

    def get_position(self):
        """Return the index range formatted as ``(start:end)``.

        For example, a lexem created with ``start_idx=1`` and
        ``end_idx=5`` yields ``(1:5)``.
        """
        bounds = (self._start_idx, self._end_idx)
        return '({}:{})'.format(*bounds)
class Lexer:
    """Static helper that splits a preprocessed string into lexemes.

    The kinds of lexemes that can be produced are listed in the
    :class:`.LexemType` docstring.
    """

    @staticmethod
    def get_lexemes(raw_str):
        """Lex the string from left to right.

        Args:
            raw_str (str): the string to lex.

        Returns:
            list[:class:`.Lexem`] -- lexemes in order of appearance.

        Raises:
            :class:`exceptions.LexException`: if a quoted string is
                opened but never closed.
        """
        total_len = len(raw_str)
        lexemes = []
        remainder = raw_str.lstrip()

        while remainder:
            # Only the left side of `remainder` is ever trimmed, so the
            # length difference is the offset in the original string.
            offset = total_len - len(remainder)
            lexem, remainder = Lexer._get_first_lexem(remainder, offset)
            lexemes.append(lexem)
            remainder = remainder.lstrip()

        type_names = ','.join(lex.get_type().name for lex in lexemes)
        logging.debug('Lexer: {} was lexed to {}'.format(raw_str, type_names))
        return lexemes

    @staticmethod
    def _get_first_lexem(raw_str, start_idx_in_original):
        """Cut the first lexem off the front of the string.

        Invariant: the string is not empty and does not start with
        whitespace.

        Args:
            raw_str (str): the string to analyze;
            start_idx_in_original (int): index of ``raw_str[0]`` inside
                the original string this one is a suffix of.

        Returns:
            tuple -- ``(lexem, rest)`` where ``lexem`` is the
            :class:`.Lexem` found at the front and ``rest`` is the part
            of ``raw_str`` that follows it.

        Raises:
            :class:`exceptions.LexException`: if the string starts with
                a quote that has no matching closing quote.
        """
        quote_chars = ('"', "'")
        origin = start_idx_in_original
        head = raw_str[0]

        if head == '|':
            return Lexem(LexemType.PIPE, '|', origin, origin), raw_str[1:]

        if head in quote_chars:
            # The closing quote must be the same character as the opening one.
            closing_idx = raw_str.find(head, 1)
            if closing_idx == -1:
                raise LexException('A non-terminating quoted string starting '
                                   'at position {}'.format(origin))
            quoted = Lexem(LexemType.QUOTED_STRING,
                           raw_str[:closing_idx + 1],
                           origin,
                           origin + closing_idx)
            return quoted, raw_str[closing_idx + 1:]

        # Plain token: it runs until the first whitespace or quote character.
        # Note that a `|` after the first character does not end the token.
        token_end = next(
            (idx for idx in range(1, len(raw_str))
             if raw_str[idx].isspace() or raw_str[idx] in quote_chars),
            len(raw_str),
        )
        token, remainder = raw_str[:token_end], raw_str[token_end:]
        kind = LexemType.ASSIGNMENT if '=' in token else LexemType.STRING
        return Lexem(kind, token, origin, origin + token_end - 1), remainder