public abstract class RegexLexer extends Object
| Modifier and Type | Class and Description |
|---|---|
static class |
RegexLexer.ClassSetOperator |
static class |
RegexLexer.ParseGroupNameResult |
protected static class |
RegexLexer.ParseGroupNameResultState |
| Modifier and Type | Field and Description |
|---|---|
protected CompilationBuffer |
compilationBuffer |
protected static TBitSet |
DEFAULT_WHITESPACE |
protected Map<String,List<Integer>> |
namedCaptureGroups |
protected String |
pattern
The source of the input pattern.
|
protected int |
position
The index of the next character in
RegexLexer.pattern to be parsed. |
protected static TBitSet |
PREDEFINED_CHAR_CLASSES |
RegexSource |
source |
| Constructor and Description |
|---|
RegexLexer(RegexSource source,
CompilationBuffer compilationBuffer) |
| Modifier and Type | Method and Description |
|---|---|
protected void |
advance() |
protected void |
advance(int len) |
protected boolean |
atEnd() |
protected abstract long |
boundedQuantifierMaxValue()
The maximum value allowed while parsing bounded quantifiers.
|
protected abstract ClassSetContents |
caseFoldClassSetAtom(ClassSetContents classSetContents)
Case folds an atom in a class set expression.
|
protected abstract void |
caseFoldUnfold(CodePointSetAccumulator charClass)
Updates a character set by expanding it to the set of characters that case fold to the same
characters as the characters currently in the set.
|
protected abstract void |
checkClassSetCharacter(int codePoint)
Checks whether
codepoint can appear as an unescaped literal class set character. |
protected abstract CodePointSet |
complementClassSet(CodePointSet codePointSet)
Returns the complement of a class set element.
|
protected char |
consumeChar() |
protected boolean |
consumingLookahead(Predicate<Character> predicate,
int length) |
protected boolean |
consumingLookahead(String match) |
protected int |
count(Predicate<Character> predicate) |
protected int |
count(Predicate<Character> predicate,
int fromIndex,
int toIndex) |
protected int |
countDecimalDigits() |
protected int |
countFrom(Predicate<Character> predicate,
int fromIndex) |
protected int |
countUpTo(Predicate<Character> predicate,
int max) |
protected char |
curChar() |
protected abstract boolean |
featureEnabledAZPositionAssertions()
Returns
true if \A and \Z position assertions are supported. |
protected abstract boolean |
featureEnabledBoundedQuantifierEmptyMin()
Returns
true if empty minimum values in bounded quantifiers (e.g. {,1}) are allowed and treated as zero. |
protected abstract boolean |
featureEnabledCharClassFirstBracketIsLiteral()
Returns
true if the first character in a character class must be interpreted as part
of the character set, even if it is the closing bracket ']'. |
protected abstract boolean |
featureEnabledClassSetExpressions()
Returns
true if class set expressions (e.g. [[\w\q{abc|xyz}]--[a-cx-z]]) are supported. |
protected abstract boolean |
featureEnabledForwardReferences()
Returns
true if forward references are allowed. |
protected abstract boolean |
featureEnabledGroupComments()
Returns
true if group comments (e.g. (# ... )) are supported. |
protected abstract boolean |
featureEnabledIgnoreCase()
Returns
true if ignore-case mode is currently enabled. |
protected abstract boolean |
featureEnabledIgnoreWhiteSpace()
Returns
true if white space in the pattern is ignored. |
protected abstract boolean |
featureEnabledLineComments()
Returns
true if line comments (e.g. # ... ) are supported. |
protected abstract boolean |
featureEnabledNestedCharClasses()
Returns
true if nested character classes are supported. |
protected abstract boolean |
featureEnabledOctalEscapes()
Returns
true if octal escapes (e.g. \012) are supported. |
protected abstract boolean |
featureEnabledPOSIXCharClasses()
Returns
true if POSIX character classes, character equivalence classes, and the POSIX
Collating Element Operator are supported. |
protected abstract boolean |
featureEnabledPossessiveQuantifiers()
Returns
true if possessive quantifiers (+ suffix) are allowed. |
protected abstract boolean |
featureEnabledSpecialGroups()
Returns
true if any constructs that alter a capture group's function, such as
non-capturing groups (?:) or look-around assertions (?=), are supported. |
protected abstract boolean |
featureEnabledUnicodePropertyEscapes()
Returns
true if unicode property escapes (e.g. \p{...}) are supported. |
protected abstract boolean |
featureEnabledWordBoundaries()
Returns
true if \b and \B word boundary position assertions are
supported. |
protected abstract boolean |
featureEnabledZLowerCaseAssertion()
Returns
true if \z position assertion is supported. |
protected boolean |
findChars(char... chars) |
protected int |
finishSurrogatePair(char c) |
protected abstract CodePointSet |
getDotCodePointSet()
Returns the code point set represented by the dot operator.
|
protected abstract CodePointSet |
getIdContinue()
Returns the set of all codepoints a group identifier may continue with.
|
protected abstract CodePointSet |
getIdStart()
Returns the set of all codepoints a group identifier may begin with.
|
protected int |
getLastAtomPosition() |
int |
getLastCharacterClassBeginPosition() |
int |
getLastTokenPosition()
Returns the last token's position in the pattern string.
|
protected abstract int |
getMaxBackReferenceDigits()
Returns the maximum number of digits to parse when parsing a back-reference.
|
Map<String,List<Integer>> |
getNamedCaptureGroups() |
protected abstract CodePointSet |
getPOSIXCharClass(String name)
Returns the POSIX character class associated to the given name.
|
protected abstract CodePointSet |
getPredefinedCharClass(char c)
Returns the CodePointSet associated with the given predefined character class (e.g. \d).
|
protected abstract TBitSet |
getWhitespace()
The set of codepoints to consider as whitespace in comments and "ignore white space" mode.
|
protected abstract RegexSyntaxException |
handleBoundedQuantifierOutOfOrder()
Handle
{2,1}. |
protected abstract Token |
handleBoundedQuantifierOverflow(long min,
long max)
Handle integer overflows in quantifier bounds, e.g. {2147483649}.
|
protected abstract Token |
handleBoundedQuantifierOverflowMin(long min,
long max)
Handle integer overflows in quantifier bounds, e.g. {2147483649}.
|
protected abstract Token |
handleBoundedQuantifierSyntaxError()
Handle syntax errors in bounded quantifiers (missing }, non-digit characters).
|
protected abstract RegexSyntaxException |
handleCCRangeOutOfOrder(int startPos)
Handle out of order character class range elements, e.g. [b-a].
|
protected abstract void |
handleCCRangeWithPredefCharClass(int startPos,
ClassSetContents firstAtom,
ClassSetContents secondAtom)
Handle non-codepoint character class range elements, e.g. [\w-a].
|
protected abstract RegexSyntaxException |
handleComplementOfStringSet()
Handle complement of class set expressions containing strings, e.g. [^\q{abc}] or \P{RGI_Emoji}.
|
protected abstract RegexSyntaxException |
handleEmptyGroupName()
Handle empty group name in group references.
|
protected abstract void |
handleGroupRedefinition(String name,
int newId,
int oldId) |
protected abstract void |
handleIncompleteEscapeX()
Handle incomplete hex escapes, e.g. \x1.
|
protected abstract void |
handleInvalidBackReference(int reference)
Handle group references to non-existent groups.
|
protected abstract void |
handleInvalidBackReference(String reference)
Handle group references to non-existent groups.
|
protected abstract RegexSyntaxException |
handleInvalidCharInCharClass() |
protected abstract RegexSyntaxException |
handleInvalidGroupBeginQ()
Handle groups starting with
(? and invalid next char. |
protected abstract RegexSyntaxException |
handleMissingClassSetOperand(RegexLexer.ClassSetOperator operator)
Handle missing operands in class set expressions, e.g. [\s&&] or [\w--].
|
protected abstract RegexSyntaxException |
handleMixedClassSetOperators(RegexLexer.ClassSetOperator leftOperator,
RegexLexer.ClassSetOperator rightOperator)
Handle class set expressions with mixed set operators in the same nested set.
|
protected abstract void |
handleOctalOutOfRange()
Handle octal values larger than 255.
|
protected abstract RegexSyntaxException |
handleRangeAsClassSetOperand(RegexLexer.ClassSetOperator operator)
Handle character ranges as operands in class set expressions with operators other than union.
|
protected abstract void |
handleUnfinishedEscape()
Handle unfinished escape (e.g. \).
|
protected abstract void |
handleUnfinishedGroupComment()
Handle unfinished group comment
(#...). |
protected abstract RegexSyntaxException |
handleUnfinishedGroupQ()
Handle unfinished group with question mark
(?. |
protected abstract RegexSyntaxException |
handleUnfinishedRangeInClassSet()
Handle unfinished range in class set expression
[a-]. |
protected abstract RegexSyntaxException |
handleUnmatchedLeftBracket()
Handle unmatched
[. |
protected abstract void |
handleUnmatchedRightBrace()
Handle unmatched }.
|
protected abstract void |
handleUnmatchedRightBracket()
Handle unmatched
]. |
protected boolean |
hasNamedCaptureGroups()
Checks whether this regular expression contains any named capture groups.
|
boolean |
hasNext() |
boolean |
inCharacterClass() |
static boolean |
isAscii(int c) |
boolean |
isCurCharClassInverted() |
static boolean |
isDecimalDigit(int c) |
protected boolean |
isEscaped() |
static boolean |
isHexDigit(int c) |
static boolean |
isOctalDigit(int c) |
protected boolean |
isPredefCharClass(char c)
Returns
true iff the given character is a predefined character class when preceded
with a backslash (e.g. \d). |
protected Token |
literalChar(int codePoint) |
protected boolean |
lookahead(Predicate<Character> predicate,
int length) |
protected boolean |
lookahead(String match) |
protected boolean |
lookbehind(char c) |
Token |
next() |
int |
numberOfCaptureGroupsSoFar() |
protected abstract int |
parseCodePointInGroupName()
Parse the next codepoint in a group name and return it.
|
protected abstract Token |
parseCustomEscape(char c)
Parse any escape sequence starting with
\ and the argument c. |
protected abstract int |
parseCustomEscapeChar(char c,
boolean inCharClass)
Parse an escape character sequence (inside character class, or other escapes have already
been tried) starting with
\ and the argument c. |
protected abstract int |
parseCustomEscapeCharFallback(int c,
boolean inCharClass)
Parse an escape character sequence (inside character class, or other escapes have already
been tried) starting with
\ and the code point c. This method is called after
all other means of parsing the escape sequence have been exhausted. |
protected abstract Token |
parseCustomGroupBeginQ(char charAfterQuestionMark)
Parse group starting with
(?. |
protected abstract Token |
parseGroupLt()
Parse group starting with
(<. |
protected RegexLexer.ParseGroupNameResult |
parseGroupName(char terminator)
Parse a
GroupName, i.e. <RegExpIdentifierName>, assuming that the opening < bracket was already read. |
protected int |
parseIntSaturated(int firstDigit,
int length,
int returnOnOverflow) |
protected long |
parseIntSaturated(int firstDigit,
int length,
int returnOnOverflow,
long maxValue) |
protected int |
parseOctal(int firstDigit) |
protected void |
registerNamedCaptureGroup(String name) |
protected void |
retreat() |
RegexSyntaxException |
syntaxError(String msg) |
int |
totalNumberOfCaptureGroups() |
protected abstract void |
validatePOSIXCollationElement(String sequence)
Checks if the given string is a valid collation element.
|
protected abstract void |
validatePOSIXEquivalenceClass(String sequence)
Checks if the given string is a valid equivalence class.
|
protected static final TBitSet PREDEFINED_CHAR_CLASSES
protected static final TBitSet DEFAULT_WHITESPACE
public final RegexSource source
protected final String pattern
protected int position
The index of the next character in RegexLexer.pattern to be parsed.
protected final CompilationBuffer compilationBuffer
public RegexLexer(RegexSource source, CompilationBuffer compilationBuffer)
protected abstract boolean featureEnabledIgnoreCase()
Returns true if ignore-case mode is currently enabled.
protected abstract boolean featureEnabledAZPositionAssertions()
Returns true if \A and \Z position assertions are supported.
protected abstract boolean featureEnabledZLowerCaseAssertion()
Returns true if \z position assertion is supported.
protected abstract boolean featureEnabledWordBoundaries()
Returns true if \b and \B word boundary position assertions are
supported.
protected abstract boolean featureEnabledBoundedQuantifierEmptyMin()
Returns true if empty minimum values in bounded quantifiers (e.g. {,1}) are
allowed and treated as zero.
protected abstract boolean featureEnabledPossessiveQuantifiers()
Returns true if possessive quantifiers (+ suffix) are allowed.
protected abstract boolean featureEnabledCharClassFirstBracketIsLiteral()
Returns true if the first character in a character class must be interpreted as part
of the character set, even if it is the closing bracket ']'.
protected abstract boolean featureEnabledNestedCharClasses()
Returns true if nested character classes are supported. This is required for
RegexLexer.featureEnabledPOSIXCharClasses().
protected abstract boolean featureEnabledPOSIXCharClasses()
Returns true if POSIX character classes, character equivalence classes, and the POSIX
Collating Element Operator are supported. Requires
RegexLexer.featureEnabledNestedCharClasses().
protected abstract CodePointSet getPOSIXCharClass(String name)
protected abstract void validatePOSIXCollationElement(String sequence)
protected abstract void validatePOSIXEquivalenceClass(String sequence)
protected abstract boolean featureEnabledForwardReferences()
Returns true if forward references are allowed.
protected abstract boolean featureEnabledGroupComments()
Returns true if group comments (e.g. (# ... )) are supported.
protected abstract boolean featureEnabledLineComments()
Returns true if line comments (e.g. # ... ) are supported.
protected abstract boolean featureEnabledIgnoreWhiteSpace()
Returns true if white space in the pattern is ignored. This is relevant only if line
comments are not supported.
protected abstract TBitSet getWhitespace()
protected abstract boolean featureEnabledOctalEscapes()
Returns true if octal escapes (e.g. \012) are supported.
protected abstract boolean featureEnabledSpecialGroups()
Returns true if any constructs that alter a capture group's function, such as
non-capturing groups (?:) or look-around assertions (?=), are supported. If
this flag is false, groups starting with a question mark (? do not have any
special meaning.
protected abstract boolean featureEnabledUnicodePropertyEscapes()
Returns true if unicode property escapes (e.g. \p{...}) are supported.
protected abstract boolean featureEnabledClassSetExpressions()
Returns true if class set expressions (e.g. [[\w\q{abc|xyz}]--[a-cx-z]]) are
supported.
protected abstract void caseFoldUnfold(CodePointSetAccumulator charClass)
protected abstract ClassSetContents caseFoldClassSetAtom(ClassSetContents classSetContents)
protected abstract CodePointSet complementClassSet(CodePointSet codePointSet)
protected abstract CodePointSet getDotCodePointSet()
protected abstract CodePointSet getIdStart()
protected abstract CodePointSet getIdContinue()
protected abstract int getMaxBackReferenceDigits()
protected boolean isPredefCharClass(char c)
Returns true iff the given character is a predefined character class when preceded
with a backslash (e.g. \d).
protected abstract CodePointSet getPredefinedCharClass(char c)
Returns the CodePointSet associated with the given predefined character class (e.g. \d).
Note that the CodePointSet returned by this function has already been case-folded and negated.
protected abstract long boundedQuantifierMaxValue()
The maximum value allowed while parsing bounded quantifiers. Larger values will cause a call to
RegexLexer.handleBoundedQuantifierOverflow(long, long).
protected abstract RegexSyntaxException handleBoundedQuantifierOutOfOrder()
Handle {2,1}.
protected abstract Token handleBoundedQuantifierSyntaxError()
protected abstract Token handleBoundedQuantifierOverflow(long min, long max)
Handle integer overflows in quantifier bounds, e.g. {2147483649}. If this method
returns a non-null value, it will be returned instead of the current quantifier.
protected abstract Token handleBoundedQuantifierOverflowMin(long min, long max)
Handle integer overflows in quantifier bounds, e.g. {2147483649}. If this method
returns a non-null value, it will be returned instead of the current quantifier. This method
is called when no explicit max value is present.
protected abstract RegexSyntaxException handleCCRangeOutOfOrder(int startPos)
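Handle out of order character class range elements, e.g. [b-a].
protected abstract void handleCCRangeWithPredefCharClass(int startPos,
ClassSetContents firstAtom,
ClassSetContents secondAtom)
Handle non-codepoint character class range elements, e.g. [\w-a].
protected abstract RegexSyntaxException handleComplementOfStringSet()
Handle complement of class set expressions containing strings, e.g. [^\q{abc}] or
\P{RGI_Emoji}.
protected abstract RegexSyntaxException handleEmptyGroupName()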
protected abstract void handleGroupRedefinition(String name, int newId, int oldId)
protected abstract void handleIncompleteEscapeX()
Handle incomplete hex escapes, e.g. \x1.
protected abstract void handleInvalidBackReference(int reference)
protected abstract void handleInvalidBackReference(String reference)
protected abstract RegexSyntaxException handleInvalidCharInCharClass()
protected abstract RegexSyntaxException handleInvalidGroupBeginQ()
Handle groups starting with (? and invalid next char.
protected abstract RegexSyntaxException handleMixedClassSetOperators(RegexLexer.ClassSetOperator leftOperator, RegexLexer.ClassSetOperator rightOperator)
protected abstract RegexSyntaxException handleMissingClassSetOperand(RegexLexer.ClassSetOperator operator)
Handle missing operands in class set expressions, e.g. [\s&&] or [\w--].
protected abstract void handleOctalOutOfRange()
protected abstract RegexSyntaxException handleRangeAsClassSetOperand(RegexLexer.ClassSetOperator operator)
protected abstract void handleUnfinishedEscape()
Handle unfinished escape (e.g. \).
protected abstract void handleUnfinishedGroupComment()
Handle unfinished group comment (#...).
protected abstract RegexSyntaxException handleUnfinishedGroupQ()
Handle unfinished group with question mark (?.
protected abstract RegexSyntaxException handleUnfinishedRangeInClassSet()
Handle unfinished range in class set expression [a-].
protected abstract void handleUnmatchedRightBrace()
protected abstract RegexSyntaxException handleUnmatchedLeftBracket()
Handle unmatched [.
protected abstract void handleUnmatchedRightBracket()
Handle unmatched ].
protected abstract void checkClassSetCharacter(int codePoint)
throws RegexSyntaxException
Checks whether codepoint can appear as an unescaped literal class set character.
Throws: RegexSyntaxException
protected abstract int parseCodePointInGroupName()
throws RegexSyntaxException
Throws: RegexSyntaxException
protected abstract Token parseCustomEscape(char c)
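Parse any escape sequence starting with \ and the argument c.
protected abstract int parseCustomEscapeChar(char c,
boolean inCharClass)
Parse an escape character sequence (inside character class, or other escapes have already
been tried) starting with \ and the argument c.
protected abstract int parseCustomEscapeCharFallback(int c,
boolean inCharClass)
Parse an escape character sequence (inside character class, or other escapes have already
been tried) starting with \ and the code point c. This method is called after
all other means of parsing the escape sequence have been exhausted.
protected abstract Token parseCustomGroupBeginQ(char charAfterQuestionMark)
Parse group starting with (?.
protected abstract Token parseGroupLt()
Parse group starting with (<.
protected boolean findChars(char... chars)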
protected void advance()
protected void retreat()
public boolean hasNext()
public Token next() throws RegexSyntaxException
Throws: RegexSyntaxException
public int getLastTokenPosition()
public int getLastCharacterClassBeginPosition()
protected int getLastAtomPosition()
protected char curChar()
protected char consumeChar()
protected void advance(int len)
protected boolean lookahead(String match)
protected boolean consumingLookahead(String match)
protected boolean lookbehind(char c)
protected boolean isEscaped()
protected boolean atEnd()
public boolean inCharacterClass()
public boolean isCurCharClassInverted()
public int totalNumberOfCaptureGroups()
throws RegexSyntaxException
Throws: RegexSyntaxException
public int numberOfCaptureGroupsSoFar()
public Map<String,List<Integer>> getNamedCaptureGroups() throws RegexSyntaxException
Throws: RegexSyntaxException
protected boolean hasNamedCaptureGroups()
throws RegexSyntaxException
Checks whether this regular expression contains any named capture groups.
This method is a way to check whether we are parsing the goal symbol Pattern[~U, +N] or Pattern[~U, ~N] (see the ECMAScript RegExp grammar).
Throws: RegexSyntaxException
protected void registerNamedCaptureGroup(String name)
protected Token literalChar(int codePoint)
protected RegexLexer.ParseGroupNameResult parseGroupName(char terminator) throws RegexSyntaxException
Parse a GroupName, i.e. <RegExpIdentifierName>, assuming that the opening
< bracket was already read.
Returns: RegExpIdentifierName
Throws: RegexSyntaxException
protected int parseIntSaturated(int firstDigit,
int length,
int returnOnOverflow)
protected long parseIntSaturated(int firstDigit,
int length,
int returnOnOverflow,
long maxValue)
protected int countDecimalDigits()
protected int finishSurrogatePair(char c)
protected int parseOctal(int firstDigit)
public RegexSyntaxException syntaxError(String msg)
public static boolean isDecimalDigit(int c)
public static boolean isOctalDigit(int c)
public static boolean isHexDigit(int c)
public static boolean isAscii(int c)