/* * Copyright (C) 2009 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef YarrParser_h #define YarrParser_h #include "Yarr.h" #include #include #include namespace JSC { namespace Yarr { #define REGEXP_ERROR_PREFIX "Invalid regular expression: " enum BuiltInCharacterClassID { DigitClassID, SpaceClassID, WordClassID, NewlineClassID, }; // The Parser class should not be used directly - only via the Yarr::parse() method. template class Parser { private: template friend const char* parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit); enum ErrorCode { NoError, PatternTooLarge, QuantifierOutOfOrder, QuantifierWithoutAtom, QuantifierTooLarge, MissingParentheses, ParenthesesUnmatched, ParenthesesTypeInvalid, CharacterClassUnmatched, CharacterClassOutOfOrder, EscapeUnterminated, NumberOfErrorCodes }; /* * CharacterClassParserDelegate: * * The class CharacterClassParserDelegate is used in the parsing of character * classes. This class handles detection of character ranges. This class * implements enough of the delegate interface such that it can be passed to * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused * to perform the parsing of escape characters in character sets. */ class CharacterClassParserDelegate { public: CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) : m_delegate(delegate) , m_err(err) , m_state(Empty) , m_character(0) { } /* * begin(): * * Called at beginning of construction. */ void begin(bool invert) { m_delegate.atomCharacterClassBegin(invert); } /* * atomPatternCharacter(): * * This method is called either from parseCharacterClass() (for an unescaped * character in a character class), or from parseEscape(). In the former case * the value true will be passed for the argument 'hyphenIsRange', and in this * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ * is different to /[a\-z]/). */ void atomPatternCharacter(UChar ch, bool hyphenIsRange = false) { switch (m_state) { case AfterCharacterClass: // Following a builtin character class we need look out for a hyphen. // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/. // If we see a hyphen following a charater class then unlike usual // we'll report it to the delegate immediately, and put ourself into // a poisoned state. Any following calls to add another character or // character class will result in an error. (A hypen following a // character-class is itself valid, but only at the end of a regex). if (hyphenIsRange && ch == '-') { m_delegate.atomCharacterClassAtom('-'); m_state = AfterCharacterClassHyphen; return; } // Otherwise just fall through - cached character so treat this as Empty. case Empty: m_character = ch; m_state = CachedCharacter; return; case CachedCharacter: if (hyphenIsRange && ch == '-') m_state = CachedCharacterHyphen; else { m_delegate.atomCharacterClassAtom(m_character); m_character = ch; } return; case CachedCharacterHyphen: if (ch ()) , m_size(pattern.length()) , m_index(0) , m_parenthesesNestingDepth(0) { } /* * parseEscape(): * * Helper for parseTokens() AND parseCharacterClass(). * Unlike the other parser methods, this function does not report tokens * directly to the member delegate (m_delegate), instead tokens are * emitted to the delegate provided as an argument. In the case of atom * escapes, parseTokens() will call parseEscape() passing m_delegate as * an argument, and as such the escape will be reported to the delegate. * * However this method may also be used by parseCharacterClass(), in which * case a CharacterClassParserDelegate will be passed as the delegate that * tokens should be added to. A boolean flag is also provided to indicate * whether that an escape in a CharacterClass is being parsed (some parsing * rules change in this context). * * The boolean value returned by this method indicates whether the token * parsed was an atom (outside of a characted class \b and \B will be * interpreted as assertions). */ template bool parseEscape(EscapeDelegate& delegate) { ASSERT(!m_err); ASSERT(peek() == '\\'); consume(); if (atEndOfPattern()) { m_err = EscapeUnterminated; return false; } switch (peek()) { // Assertions case 'b': consume(); if (inCharacterClass) delegate.atomPatternCharacter('\b'); else { delegate.assertionWordBoundary(false); return false; } break; case 'B': consume(); if (inCharacterClass) delegate.atomPatternCharacter('B'); else { delegate.assertionWordBoundary(true); return false; } break; // CharacterClassEscape case 'd': consume(); delegate.atomBuiltInCharacterClass(DigitClassID, false); break; case 's': consume(); delegate.atomBuiltInCharacterClass(SpaceClassID, false); break; case 'w': consume(); delegate.atomBuiltInCharacterClass(WordClassID, false); break; case 'D': consume(); delegate.atomBuiltInCharacterClass(DigitClassID, true); break; case 'S': consume(); delegate.atomBuiltInCharacterClass(SpaceClassID, true); break; case 'W': consume(); delegate.atomBuiltInCharacterClass(WordClassID, true); break; // DecimalEscape case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. // First, try to parse this as backreference. if (!inCharacterClass) { ParseState state = saveState(); unsigned backReference = consumeNumber(); if (backReference = '8') { delegate.atomPatternCharacter('\\'); break; } // Fall-through to handle this as an octal escape. } // Octal escape case '0': delegate.atomPatternCharacter(consumeOctal()); break; // ControlEscape case 'f': consume(); delegate.atomPatternCharacter('\f'); break; case 'n': consume(); delegate.atomPatternCharacter('\n'); break; case 'r': consume(); delegate.atomPatternCharacter('\r'); break; case 't': consume(); delegate.atomPatternCharacter('\t'); break; case 'v': consume(); delegate.atomPatternCharacter('\v'); break; // ControlLetter case 'c': { ParseState state = saveState(); consume(); if (!atEndOfPattern()) { int control = consume(); // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) { delegate.atomPatternCharacter(control & 0x1f); break; } } restoreState(state); delegate.atomPatternCharacter('\\'); break; } // HexEscape case 'x': { consume(); int x = tryConsumeHex(2); if (x == -1) delegate.atomPatternCharacter('x'); else delegate.atomPatternCharacter(x); break; } // UnicodeEscape case 'u': { consume(); int u = tryConsumeHex(4); if (u == -1) delegate.atomPatternCharacter('u'); else delegate.atomPatternCharacter(u); break; } // IdentityEscape default: delegate.atomPatternCharacter(consume()); } return true; } /* * parseAtomEscape(), parseCharacterClassEscape(): * * These methods alias to parseEscape(). */ bool parseAtomEscape() { return parseEscape(m_delegate); } void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) { parseEscape(delegate); } /* * parseCharacterClass(): * * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape) * to an instance of CharacterClassParserDelegate, to describe the character class to the * delegate. */ void parseCharacterClass() { ASSERT(!m_err); ASSERT(peek() == '['); consume(); CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); characterClassConstructor.begin(tryConsume('^')); while (!atEndOfPattern()) { switch (peek()) { case ']': consume(); characterClassConstructor.end(); return; case '\\': parseCharacterClassEscape(characterClassConstructor); break; default: characterClassConstructor.atomPatternCharacter(consume(), true); } if (m_err) return; } m_err = CharacterClassUnmatched; } /* * parseParenthesesBegin(): * * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. */ void parseParenthesesBegin() { ASSERT(!m_err); ASSERT(peek() == '('); consume(); if (tryConsume('?')) { if (atEndOfPattern()) { m_err = ParenthesesTypeInvalid; return; } switch (consume()) { case ':': m_delegate.atomParenthesesSubpatternBegin(false); break; case '=': m_delegate.atomParentheticalAssertionBegin(); break; case '!': m_delegate.atomParentheticalAssertionBegin(true); break; default: m_err = ParenthesesTypeInvalid; } } else m_delegate.atomParenthesesSubpatternBegin(); ++m_parenthesesNestingDepth; } /* * parseParenthesesEnd(): * * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). */ void parseParenthesesEnd() { ASSERT(!m_err); ASSERT(peek() == ')'); consume(); if (m_parenthesesNestingDepth > 0) m_delegate.atomParenthesesEnd(); else m_err = ParenthesesUnmatched; --m_parenthesesNestingDepth; } /* * parseQuantifier(): * * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. */ void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) { ASSERT(!m_err); ASSERT(min 0) m_err = MissingParentheses; } /* * parse(): * * This method calls parseTokens() to parse over the input and converts any * error code to a const char* for a result. */ const char* parse() { if (m_size > MAX_PATTERN_SIZE) m_err = PatternTooLarge; else parseTokens(); ASSERT(atEndOfPattern() || m_err); // The order of this array must match the ErrorCode enum. static const char* errorMessages[NumberOfErrorCodes] = { 0, // NoError REGEXP_ERROR_PREFIX "regular expression too large", REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier", REGEXP_ERROR_PREFIX "nothing to repeat", REGEXP_ERROR_PREFIX "number too large in {} quantifier", REGEXP_ERROR_PREFIX "missing )", REGEXP_ERROR_PREFIX "unmatched parentheses", REGEXP_ERROR_PREFIX "unrecognized character after (?", REGEXP_ERROR_PREFIX "missing terminating ] for character class", REGEXP_ERROR_PREFIX "range out of order in character class", REGEXP_ERROR_PREFIX "\\ at end of pattern" }; return errorMessages[m_err]; } // Misc helper functions: typedef unsigned ParseState; ParseState saveState() { return m_index; } void restoreState(ParseState state) { m_index = state; } bool atEndOfPattern() { ASSERT(m_index = n); ) { n = newValue; consume(); } return n; } unsigned consumeOctal() { ASSERT(WTF::isASCIIOctalDigit(peek())); unsigned n = consumeDigit(); while (n const char* parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite) { if (pattern.is8Bit()) return Parser(delegate, pattern, backReferenceLimit).parse(); return Parser(delegate, pattern, backReferenceLimit).parse(); } } } // namespace JSC::Yarr #endif // YarrParser_h