1 // class template regex -*- C++ -*-
 
    3 // Copyright (C) 2013-2014 Free Software Foundation, Inc.
 
    5 // This file is part of the GNU ISO C++ Library.  This library is free
 
    6 // software; you can redistribute it and/or modify it under the
 
    7 // terms of the GNU General Public License as published by the
 
    8 // Free Software Foundation; either version 3, or (at your option)
 
   11 // This library is distributed in the hope that it will be useful,
 
   12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 
   13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
   14 // GNU General Public License for more details.
 
   16 // Under Section 7 of GPL version 3, you are granted additional
 
   17 // permissions described in the GCC Runtime Library Exception, version
 
   18 // 3.1, as published by the Free Software Foundation.
 
   20 // You should have received a copy of the GNU General Public License and
 
   21 // a copy of the GCC Runtime Library Exception along with this program;
 
   22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 
   23 // <http://www.gnu.org/licenses/>.
 
   26  *  @file bits/regex_scanner.tcc
 
   27  *  This is an internal header file, included by other library headers.
 
   28  *  Do not attempt to use it directly. @headername{regex}
 
   31 // FIXME make comments doxygen format.
 
   33 // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep
 
   35 // 1) grep is basic except '\n' is treated as '|'
 
   36 // 2) egrep is extended except '\n' is treated as '|'
 
   37 // 3) awk is extended except special escaping rules, and there's no
 
   42 // ECMAScript: ECMA-262 15.10
 
   45 // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
 
   47 // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html
 
   49 namespace std _GLIBCXX_VISIBILITY(default)
 
   53 _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
   // Scanner constructor.
   // Takes the input range [__begin, __end), the syntax flags and a locale.
   // The flags are forwarded to _ScannerBase, the iterators are stored in
   // _M_current / _M_end, and the ctype facet is fetched from __loc.
   // The escape handler is chosen once here: _M_eat_escape_ecma when
   // _M_is_ecma() holds, otherwise _M_eat_escape_posix.
   // NOTE(review): the embedded line numbers skip (56, 66-67), so the
   // qualified-name line and the constructor body are not in this listing.
   55   template<typename _CharT>
 
   57     _Scanner(typename _Scanner::_IterT __begin,
 
   58         typename _Scanner::_IterT __end,
 
   59         _FlagT __flags, std::locale __loc)
 
   60     : _ScannerBase(__flags),
 
   61       _M_current(__begin), _M_end(__end),
 
   62       _M_ctype(std::use_facet<_CtypeT>(__loc)),
 
            // Pointer-to-member selecting the escape routine for this syntax.
   63       _M_eat_escape(_M_is_ecma()
 
   64            ? &_Scanner::_M_eat_escape_ecma
 
   65            : &_Scanner::_M_eat_escape_posix)
 
   // Token-advance routine (fragment).
   // NOTE(review): the name/signature lines and the per-state action lines
   // are missing from this listing; presumably this is _Scanner::_M_advance,
   // confirm against the full source.
   68   template<typename _CharT>
 
            // End of input: report the EOF token.
   73       if (_M_current == _M_end)
 
   75      _M_token = _S_token_eof;
 
            // Otherwise dispatch on the current scanner state. The statement
            // run for each state is not shown in this listing.
   79       if (_M_state == _S_state_normal)
 
   81       else if (_M_state == _S_state_in_bracket)
 
   83       else if (_M_state == _S_state_in_brace)
 
            // Debug-mode assertion; presumably the fallthrough for a state
            // that matches none of the above.
   86    _GLIBCXX_DEBUG_ASSERT(false);
 
   89   // Differences between styles:
 
   90   // 1) "\(", "\)", "\{" in basic. It's not escaping.
 
   91   // 2) "(?:", "(?=", "(?!" in ECMAScript.
 
        // Scan routine for the normal state (fragment).
        // NOTE(review): the name/signature lines and several branch-condition
        // lines are missing from this listing; presumably this is
        // _Scanner::_M_scan_normal, confirm against the full source.
   92   template<typename _CharT>
 
            // Consume one character; the branches below classify it.
   97       auto __c = *_M_current++;
 
           // Running out of input here is reported as error_escape.
  102      if (_M_current == _M_end)
 
  103        __throw_regex_error(regex_constants::error_escape);
 
               // When the next character is none of '(', ')' or '{', the
               // style-specific escape handler chosen at construction runs.
               // The first operand of this || is on a line not shown.
  106          || (*_M_current != '('
 
  107          && *_M_current != ')'
 
  108          && *_M_current != '{'))
 
  110          (this->*_M_eat_escape)();
 
           // ECMAScript group prefixes "(?:", "(?=" and "(?!".
           // NOTE(review): no end-of-input check is visible before this
           // dereference of _M_current; confirm against the full source.
  117      if (_M_is_ecma() && *_M_current == '?')
 
               // A '?' with nothing after it is error_paren.
  119          if (++_M_current == _M_end)
 
  120        __throw_regex_error(regex_constants::error_paren);
 
               // ':' gives a non-capturing group.
  122          if (*_M_current == ':')
 
  125          _M_token = _S_token_subexpr_no_group_begin;
 
               // '=' gives a lookahead; _M_value is set to 'p'.
  127          else if (*_M_current == '=')
 
  130          _M_token = _S_token_subexpr_lookahead_begin;
 
  131          _M_value.assign(1, 'p');
 
               // '!' gives a lookahead too; _M_value is set to 'n'.
  133          else if (*_M_current == '!')
 
  136          _M_token = _S_token_subexpr_lookahead_begin;
 
  137          _M_value.assign(1, 'n');
 
             // Presumably the else branch: any other character after "(?".
  140        __throw_regex_error(regex_constants::error_paren);
 
           // With the nosubs flag set, a group opener yields the
           // non-capturing token instead of the capturing one.
  142      else if (_M_flags & regex_constants::nosubs)
 
  143        _M_token = _S_token_subexpr_no_group_begin;
 
  145        _M_token = _S_token_subexpr_begin;
 
  148    _M_token = _S_token_subexpr_end;
 
           // Switch to the bracket state. _M_at_bracket_start records that
           // no bracket element has been read yet.
  151      _M_state = _S_state_in_bracket;
 
  152      _M_at_bracket_start = true;
 
           // A leading '^' selects the negated-bracket token.
  153      if (_M_current != _M_end && *_M_current == '^')
 
  155          _M_token = _S_token_bracket_neg_begin;
 
  159        _M_token = _S_token_bracket_begin;
 
           // Switch to the brace (interval) state.
  163      _M_state = _S_state_in_brace;
 
  164      _M_token = _S_token_interval_begin;
 
            // The character, narrowed to char, is searched for in
            // _M_spec_char; in grep styles '\n' also takes this branch.
            // Intermediate condition lines are not shown.
  166       else if (((__pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')))
 
  171           || (_M_is_grep() && __c == '\n'))
 
           // Walk _M_token_tbl, whose end is marked by a '\0' key, for the
           // entry matching the narrowed character and take its token.
  173      auto __it = _M_token_tbl;
 
  174      auto __narrowc = _M_ctype.narrow(__c, '\0');
 
  175      for (; __it->first != '\0'; ++__it)
 
  176        if (__it->first == __narrowc)
 
  178        _M_token = __it->second;
 
           // Debug-mode assertion; presumably reached only when the table
           // has no matching entry.
  181      _GLIBCXX_DEBUG_ASSERT(false);
 
           // The character itself becomes an ordinary-character token.
  185      _M_token = _S_token_ord_char;
 
  186      _M_value.assign(1, __c);
 
  190   // Differences between styles:
 
  191   // 1) different semantics of "[]" and "[^]".
 
  192   // 2) Escaping in bracket expr.
 
        // Scan routine for the in-bracket state (fragment).
        // NOTE(review): the name/signature lines and some branch-condition
        // lines are missing from this listing; presumably this is
        // _Scanner::_M_scan_in_bracket, confirm against the full source.
  193   template<typename _CharT>
 
            // Input ending inside a bracket expression is error_brack.
  198       if (_M_current == _M_end)
 
  199    __throw_regex_error(regex_constants::error_brack);
 
  201       auto __c = *_M_current++;
 
           // Another character is needed here; running out of input is
           // again error_brack.
  205      if (_M_current == _M_end)
 
  206        __throw_regex_error(regex_constants::error_brack);
 
           // The next character picks the class kind. It is consumed and
           // passed to _M_eat_class as the closing delimiter.
  208      if (*_M_current == '.')
 
  210          _M_token = _S_token_collsymbol;
 
  211          _M_eat_class(*_M_current++);
 
  213      else if (*_M_current == ':')
 
  215          _M_token = _S_token_char_class_name;
 
  216          _M_eat_class(*_M_current++);
 
  218      else if (*_M_current == '=')
 
  220          _M_token = _S_token_equiv_class_name;
 
  221          _M_eat_class(*_M_current++);
 
               // Presumably the else branch: the character is an ordinary
               // character.
  225          _M_token = _S_token_ord_char;
 
  226          _M_value.assign(1, __c);
 
  229       // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted
 
  230       // literally. So "[]]" or "[^]]" is valid regex. See the testcases
 
  231       // `*/empty_range.cc`.
 
  232       else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start))
 
           // End of the bracket expression: return to the normal state.
  234      _M_token = _S_token_bracket_end;
 
  235      _M_state = _S_state_normal;
 
  237       // ECMAScript and awk permit escaping in bracket.
 
  238       else if (__c == '\\' && (_M_is_ecma() || _M_is_awk()))
 
  239    (this->*_M_eat_escape)();
 
           // Presumably the else branch: any other character is an ordinary
           // character.
  242      _M_token = _S_token_ord_char;
 
  243      _M_value.assign(1, __c);
 
            // A bracket element has now been read, so a later ']' is no
            // longer at the bracket start.
  245       _M_at_bracket_start = false;
 
  248   // Differences between styles:
 
  249   // 1) "\}" in basic style.
 
        // Scan routine for the in-brace (interval) state (fragment).
        // NOTE(review): the name/signature lines and some branch-condition
        // lines are missing from this listing; presumably this is
        // _Scanner::_M_scan_in_brace, confirm against the full source.
  250   template<typename _CharT>
 
            // Input ending inside an interval is error_brace.
  255       if (_M_current == _M_end)
 
  256    __throw_regex_error(regex_constants::error_brace);
 
  258       auto __c = *_M_current++;
 
            // A digit starts a repeat count; all following digits are
            // gathered into _M_value.
  260       if (_M_ctype.is(_CtypeT::digit, __c))
 
  262      _M_token = _S_token_dup_count;
 
  263      _M_value.assign(1, __c);
 
  264      while (_M_current != _M_end
 
  265         && _M_ctype.is(_CtypeT::digit, *_M_current))
 
  266        _M_value += *_M_current++;
 
         // Comma token; the condition selecting this branch is not shown.
  269    _M_token = _S_token_comma;
 
            // In basic style the interval is closed by "\}".
  271       else if (_M_is_basic())
 
  273      if (__c == '\\' && _M_current != _M_end && *_M_current == '}')
 
               // Leave the brace state and report the end of the interval.
  275          _M_state = _S_state_normal;
 
  276          _M_token = _S_token_interval_end;
 
             // Presumably the else branch: anything else is error_badbrace.
  280        __throw_regex_error(regex_constants::error_badbrace);
 
           // Interval end for the other styles; the condition selecting
           // this branch is not shown.
  284      _M_state = _S_state_normal;
 
  285      _M_token = _S_token_interval_end;
 
         // Presumably the final else: an unexpected character inside the
         // braces is error_badbrace.
  288    __throw_regex_error(regex_constants::error_badbrace);
 
  // Escape handler (fragment). It is entered after a backslash has been
  // consumed and sets _M_token / _M_value for the escape that follows.
  // NOTE(review): the name/signature lines and several branch-condition
  // lines are missing from this listing; presumably this is
  // _Scanner::_M_eat_escape_ecma, confirm against the full source.
  291   template<typename _CharT>
 
            // Nothing after the backslash is error_escape.
  296       if (_M_current == _M_end)
 
  297    __throw_regex_error(regex_constants::error_escape);
 
  299       auto __c = *_M_current++;
 
            // Look the narrowed character up with _M_find_escape.
  300       auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
 
            // A found escape yields the ordinary character *__pos. 'b' is
            // the exception: it takes this branch only inside a bracket.
  302       if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket))
 
  304      _M_token = _S_token_ord_char;
 
  305      _M_value.assign(1, *__pos);
 
           // Word-boundary token with _M_value 'p'; the selecting condition
           // is not shown.
  309      _M_token = _S_token_word_bound;
 
  310      _M_value.assign(1, 'p');
 
           // Word-boundary token with _M_value 'n'; the selecting condition
           // is not shown.
  314      _M_token = _S_token_word_bound;
 
  315      _M_value.assign(1, 'n');
 
           // Quoted-class token carrying the class letter; the selecting
           // condition is not shown.
  325      _M_token = _S_token_quoted_class;
 
  326      _M_value.assign(1, __c);
 
           // This escape form consumes one more character and yields it as
           // an ordinary character; a missing character is error_escape.
           // The selecting condition is not shown.
  330      if (_M_current == _M_end)
 
  331        __throw_regex_error(regex_constants::error_escape);
 
  332      _M_token = _S_token_ord_char;
 
  333      _M_value.assign(1, *_M_current++);
 
            // Hex escapes: 'x' takes exactly 2 hex digits, 'u' exactly 4.
  335       else if (__c == 'x' || __c == 'u')
 
  338      for (int i = 0; i < (__c == 'x' ? 2 : 4); i++)
 
               // A missing or non-hex digit is error_escape.
  340          if (_M_current == _M_end
 
  341          || !_M_ctype.is(_CtypeT::xdigit, *_M_current))
 
  342        __throw_regex_error(regex_constants::error_escape);
 
  343          _M_value += *_M_current++;
 
  345      _M_token = _S_token_hex_num;
 
  347       // ECMAScript recognizes multi-digit back-references.
 
  348       else if (_M_ctype.is(_CtypeT::digit, __c))
 
           // Gather every consecutive digit into _M_value.
  350      _M_value.assign(1, __c);
 
  351      while (_M_current != _M_end
 
  352         && _M_ctype.is(_CtypeT::digit, *_M_current))
 
  353        _M_value += *_M_current++;
 
  354      _M_token = _S_token_backref;
 
           // Presumably the final else: the escaped character stands for
           // itself.
  358      _M_token = _S_token_ord_char;
 
  359      _M_value.assign(1, __c);
 
  363   // Differences between styles:
 
  364   // 1) Extended doesn't support backref, but basic does.
 
        // Escape handler used when the syntax is not ECMAScript (the
        // constructor picks it when _M_is_ecma() is false). It sets
        // _M_token / _M_value for the character after a backslash.
        // NOTE(review): some lines (braces, the awk branch body, #else /
        // #endif, trailing statements) are missing from this listing.
  365   template<typename _CharT>
 
  368     _M_eat_escape_posix()
 
            // Nothing after the backslash is error_escape.
  370       if (_M_current == _M_end)
 
  371    __throw_regex_error(regex_constants::error_escape);
 
            // The character is read without advancing _M_current; where the
            // iterator is advanced is not visible in this listing.
  373       auto __c = *_M_current;
 
  374       auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'));
 
            // strchr also matches the terminating '\0' (what narrow()
            // returns for a character it cannot narrow), so that hit is
            // excluded. An escaped special character is an ordinary char.
  376       if (__pos != nullptr && *__pos != '\0')
 
  378      _M_token = _S_token_ord_char;
 
  379      _M_value.assign(1, __c);
 
  381       // We MUST judge awk before handling backrefs. There's no backref in awk.
 
  382       else if (_M_is_awk())
 
            // Basic style only: a non-zero digit is a single-digit
            // back-reference.
  387       else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0')
 
  389      _M_token = _S_token_backref;
 
  390      _M_value.assign(1, __c);
 
  // Under __STRICT_ANSI__ an unrecognised escape is error_escape. The
  // assignments below are presumably the non-strict alternative that treats
  // the character as ordinary (the #else line is not shown).
  394 #ifdef __STRICT_ANSI__
 
  395      __throw_regex_error(regex_constants::error_escape);
 
  397      _M_token = _S_token_ord_char;
 
  398      _M_value.assign(1, __c);
 
  // Escape handler (fragment) that accepts looked-up escapes and octal
  // "\ddd" sequences.
  // NOTE(review): the name/signature lines and parts of the octal loop are
  // missing from this listing; presumably this is
  // _Scanner::_M_eat_escape_awk, confirm against the full source.
  404   template<typename _CharT>
 
            // NOTE(review): no end-of-input check is visible before this
            // dereference; presumably the caller has already checked.
  409       auto __c = *_M_current++;
 
  410       auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
 
            // A found escape yields the ordinary character *__pos.
  412       if (__pos != nullptr)
 
  414      _M_token = _S_token_ord_char;
 
  415      _M_value.assign(1, *__pos);
 
  417       // \ddd for oct representation
 
            // The rest of this condition is on lines not shown.
  418       else if (_M_ctype.is(_CtypeT::digit, __c)
 
  422      _M_value.assign(1,  __c);
 
            // Loop condition (its header is not shown): keep taking
            // characters while input remains and the next one is a digit
            // other than '8' or '9'.
  425           && _M_current != _M_end
 
  426           && _M_ctype.is(_CtypeT::digit, *_M_current)
 
  427           && *_M_current != '8'
 
  428           && *_M_current != '9';
 
  430        _M_value += *_M_current++;
 
  431      _M_token = _S_token_oct_num;
 
         // Presumably the final else: anything else is error_escape.
  435    __throw_regex_error(regex_constants::error_escape);
 
  438   // Eats a character class or throws an exception.
 
  439   // __ch could be ':', '.' or '=', _M_current is the char after ']' when
        // this returns normally.
 
        // The text up to the closing "__ch]" pair is collected into _M_value.
        // NOTE(review): the return-type/qualifier lines and the condition
        // choosing between the two throws are missing from this listing.
  441   template<typename _CharT>
 
  444     _M_eat_class(char __ch)
 
            // Clear _M_value, then append characters until __ch or the end
            // of input.
  446       for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;)
 
  447    _M_value += *_M_current++;
 
            // The class must close with __ch immediately followed by ']'.
            // Both are consumed by the post-increments as they are tested.
  448       if (_M_current == _M_end
 
  449      || *_M_current++ != __ch
 
  450      || _M_current == _M_end // skip __ch
 
  451      || *_M_current++ != ']') // skip ']'
 
             // An unterminated class raises error_ctype or error_collate;
             // presumably chosen by __ch, but that line is not shown.
  454        __throw_regex_error(regex_constants::error_ctype);
 
  456        __throw_regex_error(regex_constants::error_collate);
 
  460 #ifdef _GLIBCXX_DEBUG
 
  // Debug helper, sitting after the #ifdef _GLIBCXX_DEBUG line in this
  // listing. It writes a one-line description of a token to ostr; tokens
  // that carry text also print _M_value.
  // NOTE(review): the switch header, break statements, the return and some
  // case bodies are missing from this listing.
  461   template<typename _CharT>
 
  464     _M_print(std::ostream& ostr)
 
  468       case _S_token_anychar:
 
  469    ostr << "any-character\n";
 
            // The output statement for this case is not shown.
  471       case _S_token_backref:
 
  474       case _S_token_bracket_begin:
 
  475    ostr << "bracket-begin\n";
 
  477       case _S_token_bracket_neg_begin:
 
  478    ostr << "bracket-neg-begin\n";
 
  480       case _S_token_bracket_end:
 
  481    ostr << "bracket-end\n";
 
  483       case _S_token_char_class_name:
 
  484    ostr << "char-class-name \"" << _M_value << "\"\n";
 
  486       case _S_token_closure0:
 
  487    ostr << "closure0\n";
 
  489       case _S_token_closure1:
 
  490    ostr << "closure1\n";
 
  492       case _S_token_collsymbol:
 
  493    ostr << "collsymbol \"" << _M_value << "\"\n";
 
  498       case _S_token_dup_count:
 
  499    ostr << "dup count: " << _M_value << "\n";
 
  504       case _S_token_equiv_class_name:
 
  505    ostr << "equiv-class-name \"" << _M_value << "\"\n";
 
  507       case _S_token_interval_begin:
 
  508    ostr << "interval begin\n";
 
  510       case _S_token_interval_end:
 
  511    ostr << "interval end\n";
 
  513       case _S_token_line_begin:
 
  514    ostr << "line begin\n";
 
  516       case _S_token_line_end:
 
  517    ostr << "line end\n";
 
  525       case _S_token_ord_char:
 
  526    ostr << "ordinary character: \"" << _M_value << "\"\n";
 
  528       case _S_token_subexpr_begin:
 
  529    ostr << "subexpr begin\n";
 
  531       case _S_token_subexpr_no_group_begin:
 
  532    ostr << "no grouping subexpr begin\n";
 
  534       case _S_token_subexpr_lookahead_begin:
 
  535    ostr << "lookahead subexpr begin\n";
 
  537       case _S_token_subexpr_end:
 
  538    ostr << "subexpr end\n";
 
  540       case _S_token_unknown:
 
  541    ostr << "-- unknown token --\n";
 
  543       case _S_token_oct_num:
 
  544    ostr << "oct number " << _M_value << "\n";
 
  546       case _S_token_hex_num:
 
  547    ostr << "hex number " << _M_value << "\n";
 
            // Prints a literal backslash before the class letter.
  549       case _S_token_quoted_class:
 
  550    ostr << "quoted class " << "\\" << _M_value << "\n";
 
         // Debug-mode assertion; presumably the default case for a token
         // value not listed above.
  553    _GLIBCXX_DEBUG_ASSERT(false);
 
  559 _GLIBCXX_END_NAMESPACE_VERSION
 
  560 } // namespace __detail