diff --git a/parser/internal/BUILD b/parser/internal/BUILD index af815588e..f52b88652 100644 --- a/parser/internal/BUILD +++ b/parser/internal/BUILD @@ -13,6 +13,7 @@ # limitations under the License. load("@rules_cc//cc:cc_library.bzl", "cc_library") +load("@rules_cc//cc:cc_test.bzl", "cc_test") load("//bazel:antlr.bzl", "antlr_cc_library") package(default_visibility = ["//visibility:public"]) @@ -29,3 +30,29 @@ antlr_cc_library( src = "Cel.g4", package = "cel_parser_internal", ) + +cc_library( + name = "lexer", + srcs = ["lexer.cc"], + hdrs = ["lexer.h"], + deps = [ + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:no_destructor", + "@com_google_absl//absl/base:nullability", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/functional:function_ref", + "@com_google_absl//absl/log:absl_check", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", + ], +) + +cc_test( + name = "lexer_test", + srcs = ["lexer_test.cc"], + deps = [ + ":lexer", + "//internal:testing", + "@com_google_absl//absl/strings:string_view", + ], +) diff --git a/parser/internal/lexer.cc b/parser/internal/lexer.cc new file mode 100644 index 000000000..2dca4d5c2 --- /dev/null +++ b/parser/internal/lexer.cc @@ -0,0 +1,732 @@ +// Copyright 2026 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "parser/internal/lexer.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <optional>
+#include <string>
+#include <string_view>
+
+#include "absl/base/attributes.h"
+#include "absl/base/optimization.h"
+#include "absl/functional/function_ref.h"
+#include "absl/log/absl_check.h"
+#include "absl/strings/ascii.h"
+#include "absl/strings/match.h"
+#include "absl/strings/str_cat.h"
+
+namespace cel_parser_internal {
+
+namespace {
+
+// True for characters that may follow the first character of an identifier.
+bool IsIdentTrailing(unsigned char c) {
+  return absl::ascii_isdigit(c) || absl::ascii_isalpha(c) || c == '_';
+}
+
+bool IsBinaryDigit(unsigned char c) { return c == '0' || c == '1'; }
+
+bool IsPlusOrMinus(unsigned char c) { return c == '+' || c == '-'; }
+
+// Returns how many leading characters of `text` satisfy `predicate`.
+template <typename Predicate>
+size_t CountLeading(std::string_view text, Predicate predicate) {
+  size_t n = 0;
+  while (n < text.size() && predicate(static_cast<unsigned char>(text[n]))) {
+    ++n;
+  }
+  return n;
+}
+
+// Name used in diagnostics for numeric literal token types. Returns an empty
+// string for any non-numeric type.
+std::string_view TokenNumericLiteralName(TokenType type) {
+  switch (type) {
+    case TokenType::kInt:
+      return "int";
+    case TokenType::kUint:
+      return "uint";
+    case TokenType::kFloat:
+      return "float";
+    default:
+      return "";
+  }
+}
+
+}  // namespace
+
+// Returns a short human readable spelling of `type`: the literal spelling for
+// operators/delimiters/keywords, a category name otherwise, and an empty
+// string for values outside the enumeration.
+std::string_view TokenTypeToString(TokenType type) {
+  switch (type) {
+    case TokenType::kError:
+      return "error";
+    case TokenType::kEnd:
+      return "end";
+    case TokenType::kWhitespace:
+      return "whitespace";
+    case TokenType::kComment:
+      return "comment";
+    case TokenType::kNull:
+      return "null";
+    case TokenType::kFalse:
+      return "false";
+    case TokenType::kTrue:
+      return "true";
+    case TokenType::kIn:
+      return "in";
+    case TokenType::kInt:
+      return "int";
+    case TokenType::kUint:
+      return "uint";
+    case TokenType::kFloat:
+      return "float";
+    case TokenType::kString:
+      return "string";
+    case TokenType::kBytes:
+      return "bytes";
+    case TokenType::kIdent:
+      return "ident";
+    case TokenType::kLeftBracket:
+      return "[";
+    case TokenType::kRightBracket:
+      return "]";
+    case TokenType::kLeftBrace:
+      return "{";
+    case TokenType::kRightBrace:
+      return "}";
+    case TokenType::kLeftParen:
+      return "(";
+    case TokenType::kRightParen:
+      return ")";
+    case TokenType::kDot:
+      return ".";
+    case TokenType::kComma:
+      return ",";
+    case TokenType::kMinus:
+      return "-";
+    case TokenType::kPlus:
+      return "+";
+    case TokenType::kAsterisk:
+      return "*";
+    case TokenType::kSlash:
+      return "/";
+    case TokenType::kPercent:
+      return "%";
+    case TokenType::kQuestion:
+      return "?";
+    case TokenType::kColon:
+      return ":";
+    case TokenType::kExclamation:
+      return "!";
+    case TokenType::kEqual:
+      return "=";
+    case TokenType::kEqualEqual:
+      return "==";
+    case TokenType::kExclamationEqual:
+      return "!=";
+    case TokenType::kLess:
+      return "<";
+    case TokenType::kLessEqual:
+      return "<=";
+    case TokenType::kGreater:
+      return ">";
+    case TokenType::kGreaterEqual:
+      return ">=";
+    case TokenType::kLogicalAnd:
+      return "&&";
+    case TokenType::kLogicalOr:
+      return "||";
+    default:
+      return "";
+  }
+}
+
+// Produces the next token. Whitespace runs and `//` comments are returned as
+// tokens rather than skipped. Once an error has been recorded every further
+// call returns the same error token; at end of input every call returns a
+// zero-width `kEnd` token.
+Token Lexer::Lex() {
+  if (ABSL_PREDICT_FALSE(at_error_)) {
+    return MakeToken(TokenType::kError, error_.start, error_.end);
+  }
+  const int32_t start = GetPosition();
+  if (ABSL_PREDICT_FALSE(text_ == text_end_)) {
+    at_end_ = true;
+    done_ = true;
+    return MakeToken(TokenType::kEnd, start, start);
+  }
+
+  // Emits `type` spanning [start, current position).
+  const auto emit = [&](TokenType type) {
+    return MakeToken(type, start, GetPosition());
+  };
+  // Consumes exactly one character, then emits `type`.
+  const auto emit_one = [&](TokenType type) {
+    Advance(1);
+    return emit(type);
+  };
+  // Consumes one character and, if present, a following `next`; emits `two`
+  // when `next` was consumed and `one` otherwise.
+  const auto emit_one_or_two = [&](char next, TokenType two, TokenType one) {
+    Advance(1);
+    return emit(Consume(next) ? two : one);
+  };
+  // Finishes a numeric literal: it must not be immediately followed by an
+  // identifier character (e.g. `123abc`).
+  const auto emit_number = [&](TokenType type) {
+    if (ConsumeIf(IsIdentTrailing)) {
+      return SetError(
+          start, GetPosition(),
+          absl::StrCat(TokenNumericLiteralName(type),
+                       " literal has unexpected trailing characters"));
+    }
+    return emit(type);
+  };
+  // Lexes the remainder of a string/bytes literal whose prefix and opening
+  // quote have already been consumed.
+  const auto lex_quoted = [&](char quote, bool is_raw, bool is_bytes) {
+    const char triple_quote[3] = {quote, quote, quote};
+    bool terminated;
+    if (ConsumeString(std::string_view(triple_quote, 2))) {
+      // NOTE(review): a triple-quoted literal ends at the first closing
+      // delimiter, even when that delimiter is preceded by a backslash in a
+      // non-raw literal - confirm this matches the grammar.
+      terminated = ConsumeUntilAfterString(std::string_view(triple_quote, 3));
+    } else if (is_raw) {
+      // Raw literals do not interpret escape sequences, so a backslash
+      // cannot hide the closing quote (`r"\"` is a complete literal).
+      terminated = ConsumeUntilAfter(quote);
+    } else {
+      terminated = ConsumeUntilAfterUnescaped(quote);
+    }
+    if (!terminated) {
+      return SetError(start, GetPosition(),
+                      is_bytes ? "unterminated bytes literal"
+                               : "unterminated string literal");
+    }
+    return emit(is_bytes ? TokenType::kBytes : TokenType::kString);
+  };
+
+  const char c = *text_;
+  switch (c) {
+    // '\f' is whitespace in the CEL language definition. '\v' is not, but
+    // was accepted by the first version of this lexer and is kept so that
+    // previously accepted input still lexes.
+    case '\f':
+    case '\v':
+    case '\t':
+    case '\r':
+    case '\n':
+    case ' ':
+      static_cast<void>(ConsumeWhitespace());
+      return emit(TokenType::kWhitespace);
+    case '.':
+      if (text_ + 1 < text_end_ && absl::ascii_isdigit(text_[1])) {
+        // Leading-dot floating point literal such as `.5`; handled below.
+        break;
+      }
+      return emit_one(TokenType::kDot);
+    case ',':
+      return emit_one(TokenType::kComma);
+    case '!':
+      return emit_one_or_two('=', TokenType::kExclamationEqual,
+                             TokenType::kExclamation);
+    case '?':
+      return emit_one(TokenType::kQuestion);
+    case '(':
+      return emit_one(TokenType::kLeftParen);
+    case ')':
+      return emit_one(TokenType::kRightParen);
+    case '{':
+      return emit_one(TokenType::kLeftBrace);
+    case '}':
+      return emit_one(TokenType::kRightBrace);
+    case '[':
+      return emit_one(TokenType::kLeftBracket);
+    case ']':
+      return emit_one(TokenType::kRightBracket);
+    case '=':
+      return emit_one_or_two('=', TokenType::kEqualEqual, TokenType::kEqual);
+    case '<':
+      return emit_one_or_two('=', TokenType::kLessEqual, TokenType::kLess);
+    case '>':
+      return emit_one_or_two('=', TokenType::kGreaterEqual,
+                             TokenType::kGreater);
+    case ':':
+      return emit_one(TokenType::kColon);
+    case '%':
+      return emit_one(TokenType::kPercent);
+    case '+':
+      return emit_one(TokenType::kPlus);
+    case '-':
+      return emit_one(TokenType::kMinus);
+    case '*':
+      return emit_one(TokenType::kAsterisk);
+    case '/':
+      Advance(1);
+      if (Consume('/')) {
+        // The comment token includes the terminating newline, if any.
+        ConsumeLine();
+        return emit(TokenType::kComment);
+      }
+      return emit(TokenType::kSlash);
+    case '&':
+      Advance(1);
+      if (Consume('&')) {
+        return emit(TokenType::kLogicalAnd);
+      }
+      return SetError(start, GetPosition(),
+                      "unexpected single '&', expected '&&'");
+    case '|':
+      Advance(1);
+      if (Consume('|')) {
+        return emit(TokenType::kLogicalOr);
+      }
+      return SetError(start, GetPosition(),
+                      "unexpected single '|', expected '||'");
+    case '`':
+      Advance(1);
+      if (!ConsumeUntilAfter('`')) {
+        return SetError(start, GetPosition(), "unterminated quoted identifier");
+      }
+      // The token text includes both backticks.
+      return emit(TokenType::kIdent);
+    case '\'':
+    case '"':
+      Advance(1);
+      return lex_quoted(c, /*is_raw=*/false, /*is_bytes=*/false);
+    default:
+      break;
+  }
+
+  // Prefixed literals: r"..", b"..", and the two letter forms rb".." and
+  // br".." (any letter case). When no quote follows the prefix, fall through
+  // and lex an ordinary identifier.
+  if (c == 'r' || c == 'R' || c == 'b' || c == 'B') {
+    bool is_bytes = (c == 'b' || c == 'B');
+    bool is_raw = !is_bytes;
+    size_t lookahead = 1;
+    if (text_ + 1 < text_end_) {
+      const char c2 = text_[1];
+      if ((is_bytes && (c2 == 'r' || c2 == 'R')) ||
+          (!is_bytes && (c2 == 'b' || c2 == 'B'))) {
+        is_bytes = true;
+        is_raw = true;
+        lookahead = 2;
+      }
+    }
+    if (text_ + lookahead < text_end_) {
+      const char quote = text_[lookahead];
+      if (quote == '"' || quote == '\'') {
+        Advance(lookahead + 1);
+        return lex_quoted(quote, is_raw, is_bytes);
+      }
+    }
+  }
+
+  // Numeric literals.
+  if (c == '.' || absl::ascii_isdigit(c)) {
+    bool floating_point = false;
+    if (c == '.') {
+      floating_point = true;
+      Advance(1);
+      if (!ConsumeDigits()) {
+        return SetError(
+            start, GetPosition(),
+            "floating point literal missing digits after decimal separator");
+      }
+    } else {
+      Advance(1);
+      if (c == '0') {
+        if (ConsumeIgnoreCase('x')) {
+          if (!ConsumeHexDigits()) {
+            return SetError(
+                start, GetPosition(),
+                "integral literal missing digits after hexadecimal separator");
+          }
+          return emit_number(ConsumeIntegralSuffix());
+        }
+        if (ConsumeIgnoreCase('b')) {
+          if (!ConsumeBinaryDigits()) {
+            return SetError(
+                start, GetPosition(),
+                "integral literal missing digits after binary separator");
+          }
+          return emit_number(ConsumeIntegralSuffix());
+        }
+      }
+      static_cast<void>(ConsumeDigits());
+      // Only treat '.' as a decimal separator when a digit follows, so that
+      // `1.foo` lexes as int, dot, ident.
+      if (text_ < text_end_ && *text_ == '.' && text_ + 1 < text_end_ &&
+          absl::ascii_isdigit(text_[1])) {
+        floating_point = true;
+        Advance(1);
+        static_cast<void>(ConsumeDigits());
+      }
+    }
+    if (ConsumeIgnoreCase('e')) {
+      floating_point = true;
+      static_cast<void>(ConsumeIf(IsPlusOrMinus));
+      if (!ConsumeDigits()) {
+        return SetError(
+            start, GetPosition(),
+            "floating point literal missing digits after exponent separator");
+      }
+    }
+    return emit_number(floating_point ? TokenType::kFloat
+                                      : ConsumeIntegralSuffix());
+  }
+
+  // Identifiers and keywords.
+  if (c == '_' || absl::ascii_isalpha(c)) {
+    const char* word_begin = text_;
+    Advance(1);
+    ConsumeIdentTrailing();
+    // Since we have so few keywords, it's faster to use a switch statement
+    // on the first character rather than doing a map lookup.
+    const int32_t end = GetPosition();
+    const std::string_view word(word_begin, static_cast<size_t>(end - start));
+    switch (c) {
+      case 'f':
+        if (word == "false") {
+          return MakeToken(TokenType::kFalse, start, end);
+        }
+        break;
+      case 'i':
+        if (word == "in") {
+          return MakeToken(TokenType::kIn, start, end);
+        }
+        break;
+      case 'n':
+        if (word == "null") {
+          return MakeToken(TokenType::kNull, start, end);
+        }
+        break;
+      case 't':
+        if (word == "true") {
+          return MakeToken(TokenType::kTrue, start, end);
+        }
+        break;
+      default:
+        break;
+    }
+    return MakeToken(TokenType::kIdent, start, end);
+  }
+
+  Advance(1);
+  return SetError(start, GetPosition(), "unexpected character");
+}
+
+// Advances past the next occurrence of `c`, recording any newlines crossed.
+// When `c` does not occur, consumes all remaining input and returns false.
+bool Lexer::ConsumeUntilAfter(char c) {
+  ABSL_DCHECK_NE(c, '\n');
+  const auto pos = GetRemainingText().find(c);
+  if (pos == std::string_view::npos) {
+    AdvanceProcessingNewLines(text_end_);
+    return false;
+  }
+  AdvanceProcessingNewLines(pos + 1);
+  return true;
+}
+
+// Same as ConsumeUntilAfter(char), for a multi-character delimiter.
+bool Lexer::ConsumeUntilAfterString(std::string_view s) {
+  ABSL_DCHECK(!absl::StrContains(s, '\n'));
+  const auto pos = GetRemainingText().find(s);
+  if (pos == std::string_view::npos) {
+    AdvanceProcessingNewLines(text_end_);
+    return false;
+  }
+  AdvanceProcessingNewLines(pos + s.size());
+  return true;
+}
+
+// Advances past the next occurrence of `c` that is not preceded by an odd
+// number of backslashes, recording any newlines crossed. When there is no
+// such occurrence, consumes all remaining input and returns false.
+bool Lexer::ConsumeUntilAfterUnescaped(char c) {
+  ABSL_DCHECK_NE(c, '\n');
+  ABSL_DCHECK_NE(c, '\\');
+  const std::string_view text = GetRemainingText();
+  bool escaped = false;
+  for (size_t i = 0; i < text.size(); ++i) {
+    const char cc = text[i];
+    if (cc == '\\') {
+      escaped = !escaped;
+      continue;
+    }
+    if (cc == c && !escaped) {
+      AdvanceProcessingNewLines(i + 1);
+      return true;
+    }
+    escaped = false;
+  }
+  AdvanceProcessingNewLines(text_end_);
+  return false;
+}
+
+// Consumes the longest run of identifier characters at the current position.
+void Lexer::ConsumeIdentTrailing() {
+  Advance(CountLeading(GetRemainingText(), IsIdentTrailing));
+}
+
+bool Lexer::MatchString(std::string_view s) const {
+  return absl::StartsWith(GetRemainingText(), s);
+}
+
+bool Lexer::MatchStringIgnoreCase(std::string_view s) const {
+  return absl::StartsWithIgnoreCase(GetRemainingText(), s);
+}
+
+// Returns the current character when input remains and `predicate` accepts
+// it; otherwise returns std::nullopt. Does not advance.
+std::optional<char> Lexer::MatchIf(
+    absl::FunctionRef<bool(char)> predicate) const {
+  if (text_ != text_end_) {
+    const char c = *text_;
+    if (predicate(c)) {
+      return c;
+    }
+  }
+  return std::nullopt;
+}
+
+// Consumes through the end of the current line, including the newline (whose
+// following offset is recorded), or through end of input when there is none.
+void Lexer::ConsumeLine() {
+  const std::string_view text = GetRemainingText();
+  const auto pos = text.find('\n');
+  if (pos == std::string_view::npos) {
+    Advance(text.size());
+    return;
+  }
+  Advance(pos + 1);
+  if (line_offsets_ != nullptr) {
+    line_offsets_->push_back(GetPosition());
+  }
+}
+
+// Consumes a run of whitespace, recording the offset following each newline.
+// Returns true when at least one character was consumed.
+bool Lexer::ConsumeWhitespace() {
+  const std::string_view text = GetRemainingText();
+  size_t i = 0;
+  for (; i < text.size(); ++i) {
+    const char c = text[i];
+    if (c == '\n') {
+      if (line_offsets_ != nullptr) {
+        line_offsets_->push_back(GetPosition() + static_cast<int32_t>(i) + 1);
+      }
+      continue;
+    }
+    // See Lex() for why both '\f' and '\v' are accepted.
+    if (c != ' ' && c != '\r' && c != '\t' && c != '\f' && c != '\v') {
+      break;
+    }
+  }
+  Advance(i);
+  return i != 0;
+}
+
+bool Lexer::Consume(char c) {
+  ABSL_DCHECK_NE(c, '\n');
+  if (Match(c)) {
+    Advance(1);
+    return true;
+  }
+  return false;
+}
+
+bool Lexer::ConsumeIgnoreCase(char c) {
+  ABSL_DCHECK_NE(c, '\n');
+  if (MatchIgnoreCase(c)) {
+    Advance(1);
+    return true;
+  }
+  return false;
+}
+
+bool Lexer::ConsumeString(std::string_view s) {
+  ABSL_DCHECK(!absl::StrContains(s, '\n'));
+  if (MatchString(s)) {
+    Advance(s.size());
+    return true;
+  }
+  return false;
+}
+
+bool Lexer::ConsumeStringIgnoreCase(std::string_view s) {
+  ABSL_DCHECK(!absl::StrContains(s, '\n'));
+  if (MatchStringIgnoreCase(s)) {
+    Advance(s.size());
+    return true;
+  }
+  return false;
+}
+
+// Consumes and returns the current character when `predicate` accepts it.
+std::optional<char> Lexer::ConsumeIf(
+    absl::FunctionRef<bool(char)> predicate) {
+  std::optional<char> match = MatchIf(predicate);
+  if (match.has_value()) {
+    ABSL_DCHECK_NE(*match, '\n');
+    Advance(1);
+  }
+  return match;
+}
+
+// The three digit consumers below consume the longest run of matching digits
+// and return true when at least one digit was consumed.
+bool Lexer::ConsumeDigits() {
+  const size_t n = CountLeading(
+      GetRemainingText(), [](unsigned char c) { return absl::ascii_isdigit(c); });
+  Advance(n);
+  return n != 0;
+}
+
+bool Lexer::ConsumeHexDigits() {
+  const size_t n = CountLeading(GetRemainingText(), [](unsigned char c) {
+    return absl::ascii_isxdigit(c);
+  });
+  Advance(n);
+  return n != 0;
+}
+
+bool Lexer::ConsumeBinaryDigits() {
+  const size_t n = CountLeading(GetRemainingText(), IsBinaryDigit);
+  Advance(n);
+  return n != 0;
+}
+
+// Consumes an optional `u`/`U` suffix and returns the resulting token type.
+TokenType Lexer::ConsumeIntegralSuffix() {
+  if (ConsumeIgnoreCase('u')) {
+    return TokenType::kUint;
+  }
+  return TokenType::kInt;
+}
+
+// Advances by `n` characters, recording the offset following every newline
+// in the skipped range. `n` must not exceed the remaining input; in builds
+// without DCHECKs an oversized `n` is clamped instead of looping forever.
+void Lexer::AdvanceProcessingNewLines(size_t n) {
+  const std::string_view remaining = GetRemainingText();
+  ABSL_DCHECK_LE(n, remaining.size());
+  const std::string_view chunk = remaining.substr(0, n);
+  if (line_offsets_ != nullptr) {
+    std::string_view::size_type pos = chunk.find('\n');
+    while (pos != std::string_view::npos) {
+      ++pos;  // Offset of the first character of the next line.
+      line_offsets_->push_back(GetPosition() + static_cast<int32_t>(pos));
+      pos = chunk.find('\n', pos);
+    }
+  }
+  Advance(chunk.size());
+}
+
+void Lexer::AdvanceProcessingNewLines(const char* end) {
+  ABSL_DCHECK_LE(end, text_end_);
+  ABSL_DCHECK_GE(end, text_);
+  AdvanceProcessingNewLines(static_cast<size_t>(end - text_));
+}
+
+}  // namespace cel_parser_internal
diff --git a/parser/internal/lexer.h b/parser/internal/lexer.h
new file mode 100644
index 000000000..2fbd6aa0f
--- /dev/null
+++ b/parser/internal/lexer.h
@@ -0,0 +1,220 @@
+// Copyright 2026 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef THIRD_PARTY_CEL_CPP_PARSER_INTERNAL_LEXER_H_
+#define THIRD_PARTY_CEL_CPP_PARSER_INTERNAL_LEXER_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "absl/base/attributes.h"
+#include "absl/base/nullability.h"
+#include "absl/base/optimization.h"
+#include "absl/functional/function_ref.h"
+#include "absl/log/absl_check.h"
+#include "absl/strings/ascii.h"
+
+namespace cel_parser_internal {
+
+// Kinds of token produced by `Lexer::Lex()`.
+enum class TokenType {
+  kError = 0,
+  kEnd,
+  kWhitespace,
+  kComment,
+
+  // Keywords
+  kNull,
+  kFalse,
+  kTrue,
+  kIn,
+
+  // Literals
+  kInt,
+  kUint,
+  kFloat,
+  kString,
+  kBytes,
+
+  // Identifiers
+  kIdent,
+
+  // Delimiters
+  kLeftBracket,   // [
+  kRightBracket,  // ]
+  kLeftBrace,     // {
+  kRightBrace,    // }
+  kLeftParen,     // (
+  kRightParen,    // )
+
+  // Operators
+  kDot,               // .
+  kComma,             // ,
+  kMinus,             // -
+  kPlus,              // +
+  kAsterisk,          // *
+  kSlash,             // /
+  kPercent,           // %
+  kQuestion,          // ?
+  kColon,             // :
+  kExclamation,       // !
+  kEqual,             // =
+  kEqualEqual,        // ==
+  kExclamationEqual,  // !=
+  kLess,              // <
+  kLessEqual,         // <=
+  kGreater,           // >
+  kGreaterEqual,      // >=
+  kLogicalAnd,        // &&
+  kLogicalOr,         // ||
+};
+
+// Returns a short human readable spelling of `type`, or an empty string for
+// a value outside the enumeration.
+ABSL_ATTRIBUTE_PURE_FUNCTION std::string_view TokenTypeToString(TokenType type);
+
+// A lexed token. `start` and `end` are offsets into the source text handed to
+// the `Lexer`, forming the half-open range [start, end).
+struct Token final {
+  TokenType type = TokenType::kError;
+  int32_t start = 0;
+  int32_t end = 0;
+};
+
+// Details of a lexing failure. `start`/`end` use the same offsets as `Token`.
+struct LexerError final {
+  int32_t start = 0;
+  int32_t end = 0;
+  std::string message;
+};
+
+// Hand written lexer producing one token per `Lex()` call.
+//
+// The lexer stores pointers into `source`, so the underlying characters must
+// outlive the lexer. When `line_offsets` is non-null, the offset of the
+// character following every consumed '\n' is appended to it.
+class Lexer final {
+ public:
+  explicit Lexer(std::string_view source,
+                 std::vector<int32_t>* absl_nullable line_offsets = nullptr)
+      : line_offsets_(line_offsets),
+        text_begin_(source.data()),
+        text_end_(text_begin_ + source.size()),
+        text_(text_begin_) {
+    // Positions are reported as int32_t, so the source has to fit.
+    ABSL_DCHECK_LT(source.size(),
+                   static_cast<size_t>(std::numeric_limits<int32_t>::max()));
+  }
+
+  Lexer(const Lexer&) = delete;
+  Lexer(Lexer&&) = delete;
+  Lexer& operator=(const Lexer&) = delete;
+  Lexer& operator=(Lexer&&) = delete;
+
+  // Returns the next token. Whitespace and comments are returned as tokens.
+  // After a `kError` token has been returned, every later call returns a
+  // `kError` token for the same range; see `GetError()` for the message.
+  ABSL_ATTRIBUTE_NOINLINE
+  Token Lex();
+
+  // Returns the recorded error. Must only be called after `Lex()` has
+  // returned a `kError` token.
+  const LexerError& GetError() const ABSL_ATTRIBUTE_LIFETIME_BOUND {
+    ABSL_DCHECK(at_error_);
+    return error_;
+  }
+
+  // Offset of the next unconsumed character, relative to the source start.
+  int32_t GetPosition() const {
+    return static_cast<int32_t>(text_ - text_begin_);
+  }
+
+ private:
+  // `Match*` functions inspect the input without consuming it.
+  bool Match(char c) const { return text_ != text_end_ && *text_ == c; }
+
+  bool MatchIgnoreCase(char c) const {
+    return text_ != text_end_ &&
+           absl::ascii_tolower(*text_) == absl::ascii_tolower(c);
+  }
+
+  // Advances by `n` characters without looking for newlines. `n` must not
+  // exceed the remaining input.
+  void Advance(size_t n) {
+    ABSL_DCHECK_LE(n, static_cast<size_t>(text_end_ - text_));
+    text_ += n;
+  }
+
+  // Advance while recording line offsets for any newlines that are skipped.
+  void AdvanceProcessingNewLines(size_t n);
+  void AdvanceProcessingNewLines(const char* end);
+
+  std::string_view GetRemainingText() const {
+    return std::string_view(text_, static_cast<size_t>(text_end_ - text_));
+  }
+
+  Token MakeToken(TokenType type, int32_t start, int32_t end) {
+    if (ABSL_PREDICT_FALSE(at_end_)) {
+      AtEndTokenCreated();
+    }
+    return Token{.type = type, .start = start, .end = end};
+  }
+
+  // Records the first error and returns the matching `kError` token. Must be
+  // called at most once per lexer.
+  Token SetError(int32_t start, int32_t end, std::string message) {
+    ABSL_DCHECK(!at_error_);
+    at_error_ = true;
+    error_ =
+        LexerError{.start = start, .end = end, .message = std::move(message)};
+    return Token{.type = TokenType::kError, .start = start, .end = end};
+  }
+
+  void AtEndTokenCreated() { done_ = true; }
+
+  // `ConsumeUntilAfter*` functions advance past the next occurrence of the
+  // delimiter and return true, or consume all remaining input and return
+  // false when the delimiter is absent.
+  bool ConsumeUntilAfter(char c);
+
+  bool ConsumeUntilAfterString(std::string_view s);
+
+  // As `ConsumeUntilAfter(char)`, skipping occurrences that are escaped by a
+  // backslash.
+  bool ConsumeUntilAfterUnescaped(char c);
+
+  void ConsumeIdentTrailing();
+
+  bool MatchString(std::string_view s) const;
+
+  bool MatchStringIgnoreCase(std::string_view s) const;
+
+  std::optional<char> MatchIf(
+      absl::FunctionRef<bool(char)> predicate) const;
+
+  // Consumes through the end of the current line, newline included.
+  void ConsumeLine();
+
+  // The remaining `Consume*` functions advance only on a match and report
+  // whether (or which character) they consumed.
+  bool ConsumeWhitespace();
+
+  bool Consume(char c);
+
+  bool ConsumeIgnoreCase(char c);
+
+  bool ConsumeString(std::string_view s);
+
+  bool ConsumeStringIgnoreCase(std::string_view s);
+
+  std::optional<char> ConsumeIf(
+      absl::FunctionRef<bool(char)> predicate);
+
+  bool ConsumeDigits();
+
+  bool ConsumeHexDigits();
+
+  bool ConsumeBinaryDigits();
+
+  // Consumes an optional `u`/`U` suffix; returns `kUint` if present, `kInt`
+  // otherwise.
+  TokenType ConsumeIntegralSuffix();
+
+  std::vector<int32_t>* absl_nullable line_offsets_;
+  const char* absl_nonnull text_begin_;
+  const char* absl_nonnull text_end_;
+  const char* absl_nonnull text_;  // Next unconsumed character.
+  bool at_end_ = false;    // Set once end of input has been reached.
+  bool at_error_ = false;  // Set once `SetError()` has been called.
+  bool done_ = false;      // Set once the `kEnd` token has been created.
+  LexerError error_;
+};
+
+}  // namespace cel_parser_internal
+
+#endif  // THIRD_PARTY_CEL_CPP_PARSER_INTERNAL_LEXER_H_
diff --git a/parser/internal/lexer_test.cc b/parser/internal/lexer_test.cc
new file mode 100644
index 000000000..324ca47fe
--- /dev/null
+++ b/parser/internal/lexer_test.cc
@@ -0,0 +1,282 @@
+// Copyright 2026 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "parser/internal/lexer.h"
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "internal/testing.h"
+
+namespace cel_parser_internal {
+namespace {
+
+// Matches a `Token` of `expected_type` whose [start, end) range selects
+// `expected_text` out of `source`.
+MATCHER_P3(IsToken, source, expected_type, expected_text, "") {
+  if (arg.type != expected_type) {
+    *result_listener << "type is " << TokenTypeToString(arg.type)
+                     << " (expected " << TokenTypeToString(expected_type)
+                     << ")";
+    return false;
+  }
+  const std::string_view actual_text =
+      source.substr(arg.start, arg.end - arg.start);
+  if (actual_text == expected_text) {
+    return true;
+  }
+  *result_listener << "text is '" << actual_text << "' (expected '"
+                   << expected_text << "')";
+  return false;
+}
+
+// An expected token: its type and the exact source text it covers.
+using ExpectedToken = std::pair<TokenType, std::string_view>;
+
+// The single-space whitespace token separating most test inputs.
+const ExpectedToken kSpace{TokenType::kWhitespace, " "};
+
+// The zero-width token produced at end of input.
+const ExpectedToken kEndOfInput{TokenType::kEnd, ""};
+
+// Calls `lexer.Lex()` once per entry of `expected`, in order, and checks each
+// result against the corresponding entry.
+void ExpectTokens(Lexer& lexer, std::string_view source,
+                  const std::vector<ExpectedToken>& expected) {
+  for (const auto& [type, text] : expected) {
+    EXPECT_THAT(lexer.Lex(), IsToken(source, type, text));
+  }
+}
+
+TEST(LexerTest, KeywordsAndIdents) {
+  std::string_view source = "null false true in foo_bar `quoted.ident`";
+  Lexer lexer(source);
+
+  ExpectTokens(lexer, source,
+               {
+                   {TokenType::kNull, "null"},
+                   kSpace,
+                   {TokenType::kFalse, "false"},
+                   kSpace,
+                   {TokenType::kTrue, "true"},
+                   kSpace,
+                   {TokenType::kIn, "in"},
+                   kSpace,
+                   {TokenType::kIdent, "foo_bar"},
+                   kSpace,
+                   {TokenType::kIdent, "`quoted.ident`"},
+                   kEndOfInput,
+               });
+}
+
+TEST(LexerTest, Numbers) {
+  std::string_view source =
+      "123 45u 0x1A 0b101 3.14 .5 1e6 2.5e-3 45U 0x1Au 0x1AU";
+  Lexer lexer(source);
+
+  ExpectTokens(lexer, source,
+               {
+                   {TokenType::kInt, "123"},
+                   kSpace,
+                   {TokenType::kUint, "45u"},
+                   kSpace,
+                   {TokenType::kInt, "0x1A"},
+                   kSpace,
+                   {TokenType::kInt, "0b101"},
+                   kSpace,
+                   {TokenType::kFloat, "3.14"},
+                   kSpace,
+                   {TokenType::kFloat, ".5"},
+                   kSpace,
+                   {TokenType::kFloat, "1e6"},
+                   kSpace,
+                   {TokenType::kFloat, "2.5e-3"},
+                   kSpace,
+                   {TokenType::kUint, "45U"},
+                   kSpace,
+                   {TokenType::kUint, "0x1Au"},
+                   kSpace,
+                   {TokenType::kUint, "0x1AU"},
+                   kEndOfInput,
+               });
+}
+
+TEST(LexerTest, ZeroNumbers) {
+  std::string_view source = "0 0u 0x0 0b0";
+  Lexer lexer(source);
+
+  ExpectTokens(lexer, source,
+               {
+                   {TokenType::kInt, "0"},
+                   kSpace,
+                   {TokenType::kUint, "0u"},
+                   kSpace,
+                   {TokenType::kInt, "0x0"},
+                   kSpace,
+                   {TokenType::kInt, "0b0"},
+                   kEndOfInput,
+               });
+}
+
+TEST(LexerTest, StringsAndBytes) {
+  std::string_view source = R"("hello" 'world' """multi
+line""" r"raw" b"bytes" rb'\x00' '''multi
+single''' R"raw_upper" B"bytes_upper" b'''multi
+bytes''' br"raw_bytes" `a.b-c/d e`
+"\a\b\f\n\r\t\v\"\'\\\?\` \x1A \u00A0 \U0001F600 \012")";
+  Lexer lexer(source);
+
+  ExpectTokens(
+      lexer, source,
+      {
+          {TokenType::kString, "\"hello\""},
+          kSpace,
+          {TokenType::kString, "'world'"},
+          kSpace,
+          {TokenType::kString, "\"\"\"multi\nline\"\"\""},
+          kSpace,
+          {TokenType::kString, "r\"raw\""},
+          kSpace,
+          {TokenType::kBytes, "b\"bytes\""},
+          kSpace,
+          {TokenType::kBytes, "rb'\\x00'"},
+          kSpace,
+          {TokenType::kString, "'''multi\nsingle'''"},
+          kSpace,
+          {TokenType::kString, "R\"raw_upper\""},
+          kSpace,
+          {TokenType::kBytes, "B\"bytes_upper\""},
+          kSpace,
+          {TokenType::kBytes, "b'''multi\nbytes'''"},
+          kSpace,
+          {TokenType::kBytes, "br\"raw_bytes\""},
+          kSpace,
+          {TokenType::kIdent, "`a.b-c/d e`"},
+          {TokenType::kWhitespace, "\n"},
+          {TokenType::kString,
+           "\"\\a\\b\\f\\n\\r\\t\\v\\\"\\'\\\\\\?\\` \\x1A \\u00A0 "
+           "\\U0001F600 \\012\""},
+          kEndOfInput,
+      });
+}
+
+TEST(LexerTest, OperatorsAndDelimiters) {
+  std::string_view source =
+      ". , + - * / % == != < <= > >= && || ! ? : [] { } ( )";
+  Lexer lexer(source);
+
+  ExpectTokens(lexer, source,
+               {
+                   {TokenType::kDot, "."},
+                   kSpace,
+                   {TokenType::kComma, ","},
+                   kSpace,
+                   {TokenType::kPlus, "+"},
+                   kSpace,
+                   {TokenType::kMinus, "-"},
+                   kSpace,
+                   {TokenType::kAsterisk, "*"},
+                   kSpace,
+                   {TokenType::kSlash, "/"},
+                   kSpace,
+                   {TokenType::kPercent, "%"},
+                   kSpace,
+                   {TokenType::kEqualEqual, "=="},
+                   kSpace,
+                   {TokenType::kExclamationEqual, "!="},
+                   kSpace,
+                   {TokenType::kLess, "<"},
+                   kSpace,
+                   {TokenType::kLessEqual, "<="},
+                   kSpace,
+                   {TokenType::kGreater, ">"},
+                   kSpace,
+                   {TokenType::kGreaterEqual, ">="},
+                   kSpace,
+                   {TokenType::kLogicalAnd, "&&"},
+                   kSpace,
+                   {TokenType::kLogicalOr, "||"},
+                   kSpace,
+                   {TokenType::kExclamation, "!"},
+                   kSpace,
+                   {TokenType::kQuestion, "?"},
+                   kSpace,
+                   {TokenType::kColon, ":"},
+                   kSpace,
+                   {TokenType::kLeftBracket, "["},
+                   {TokenType::kRightBracket, "]"},
+                   kSpace,
+                   {TokenType::kLeftBrace, "{"},
+                   kSpace,
+                   {TokenType::kRightBrace, "}"},
+                   kSpace,
+                   {TokenType::kLeftParen, "("},
+                   kSpace,
+                   {TokenType::kRightParen, ")"},
+               });
+}
+
+TEST(LexerTest, CommentsAndLineOffsets) {
+  std::string_view source = "a\n// comment\nb";
+  std::vector<int32_t> line_offsets;
+  Lexer lexer(source, &line_offsets);
+
+  // The comment token includes its terminating newline.
+  ExpectTokens(lexer, source,
+               {
+                   {TokenType::kIdent, "a"},
+                   {TokenType::kWhitespace, "\n"},
+                   {TokenType::kComment, "// comment\n"},
+                   {TokenType::kIdent, "b"},
+               });
+
+  // One offset per newline: the position of the character following it.
+  ASSERT_EQ(line_offsets.size(), 2);
+  EXPECT_EQ(line_offsets[0], 2);
+  EXPECT_EQ(line_offsets[1], 13);
+}
+
+// One erroneous input, the message it must produce, and where the lexer must
+// be positioned after reporting it.
+struct LexerErrorTestCase {
+  std::string_view source;
+  std::string_view expected_error_message;
+  int32_t expected_position;
+};
+
+using LexerErrorTest = testing::TestWithParam<LexerErrorTestCase>;
+
+TEST_P(LexerErrorTest, LexesErrorTokenAndStoresError) {
+  const LexerErrorTestCase& param = GetParam();
+  Lexer lexer(param.source);
+  const Token first = lexer.Lex();
+  EXPECT_EQ(first.type, TokenType::kError);
+  EXPECT_EQ(lexer.GetError().message, param.expected_error_message);
+  EXPECT_EQ(lexer.GetPosition(), param.expected_position);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    ErrorCases, LexerErrorTest,
+    testing::Values(
+        LexerErrorTestCase{
+            .source = "\"unterminated",
+            .expected_error_message = "unterminated string literal",
+            .expected_position = 13,
+        },
+        LexerErrorTestCase{
+            .source = "0x",
+            .expected_error_message =
+                "integral literal missing digits after hexadecimal separator",
+            .expected_position = 2,
+        },
+        LexerErrorTestCase{
+            .source = "@",
+            .expected_error_message = "unexpected character",
+            .expected_position = 1,
+        },
+        LexerErrorTestCase{
+            .source = "0x1A_invalid",
+            .expected_error_message =
+                "int literal has unexpected trailing characters",
+            .expected_position = 5,
+        },
+        LexerErrorTestCase{
+            .source = "0b101_invalid",
+            .expected_error_message =
+                "int literal has unexpected trailing characters",
+            .expected_position = 6,
+        },
+        LexerErrorTestCase{
+            .source = "123_invalid",
+            .expected_error_message =
+                "int literal has unexpected trailing characters",
+            .expected_position = 4,
+        }));
+
+}  // namespace
+}  // namespace cel_parser_internal