libstdc++
regex_scanner.tcc
1 // class template regex -*- C++ -*-
2 
3 // Copyright (C) 2013-2014 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /**
26  * @file bits/regex_scanner.tcc
27  * This is an internal header file, included by other library headers.
28  * Do not attempt to use it directly. @headername{regex}
29  */
30 
31 // FIXME make comments doxygen format.
32 
33 // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep
34 // and awk
35 // 1) grep is basic except '\n' is treated as '|'
36 // 2) egrep is extended except '\n' is treated as '|'
37 // 3) awk is extended except special escaping rules, and there's no
38 // back-reference.
39 //
40 // References:
41 //
42 // ECMAScript: ECMA-262 15.10
43 //
44 // basic, extended:
45 // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
46 //
47 // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html
48 
49 namespace std _GLIBCXX_VISIBILITY(default)
50 {
51 namespace __detail
52 {
53 _GLIBCXX_BEGIN_NAMESPACE_VERSION
54 
55  template<typename _CharT>
56  _Scanner<_CharT>::
57  _Scanner(typename _Scanner::_IterT __begin,
58  typename _Scanner::_IterT __end,
59  _FlagT __flags, std::locale __loc)
60  : _ScannerBase(__flags),
61  _M_current(__begin), _M_end(__end),
62  _M_ctype(std::use_facet<_CtypeT>(__loc)),
63  _M_eat_escape(_M_is_ecma()
64  ? &_Scanner::_M_eat_escape_ecma
65  : &_Scanner::_M_eat_escape_posix)
66  { _M_advance(); }
67 
68  template<typename _CharT>
69  void
70  _Scanner<_CharT>::
71  _M_advance()
72  {
73  if (_M_current == _M_end)
74  {
75  _M_token = _S_token_eof;
76  return;
77  }
78 
79  if (_M_state == _S_state_normal)
80  _M_scan_normal();
81  else if (_M_state == _S_state_in_bracket)
82  _M_scan_in_bracket();
83  else if (_M_state == _S_state_in_brace)
84  _M_scan_in_brace();
85  else
86  _GLIBCXX_DEBUG_ASSERT(false);
87  }
88 
89  // Differences between styles:
90  // 1) "\(", "\)", "\{" in basic. It's not escaping.
91  // 2) "(?:", "(?=", "(?!" in ECMAScript.
92  template<typename _CharT>
93  void
94  _Scanner<_CharT>::
95  _M_scan_normal()
96  {
97  auto __c = *_M_current++;
98  const char* __pos;
99 
100  if (__c == '\\')
101  {
102  if (_M_current == _M_end)
103  __throw_regex_error(regex_constants::error_escape);
104 
105  if (!_M_is_basic()
106  || (*_M_current != '('
107  && *_M_current != ')'
108  && *_M_current != '{'))
109  {
110  (this->*_M_eat_escape)();
111  return;
112  }
113  __c = *_M_current++;
114  }
115  if (__c == '(')
116  {
117  if (_M_is_ecma() && *_M_current == '?')
118  {
119  if (++_M_current == _M_end)
120  __throw_regex_error(regex_constants::error_paren);
121 
122  if (*_M_current == ':')
123  {
124  ++_M_current;
125  _M_token = _S_token_subexpr_no_group_begin;
126  }
127  else if (*_M_current == '=')
128  {
129  ++_M_current;
130  _M_token = _S_token_subexpr_lookahead_begin;
131  _M_value.assign(1, 'p');
132  }
133  else if (*_M_current == '!')
134  {
135  ++_M_current;
136  _M_token = _S_token_subexpr_lookahead_begin;
137  _M_value.assign(1, 'n');
138  }
139  else
140  __throw_regex_error(regex_constants::error_paren);
141  }
142  else if (_M_flags & regex_constants::nosubs)
143  _M_token = _S_token_subexpr_no_group_begin;
144  else
145  _M_token = _S_token_subexpr_begin;
146  }
147  else if (__c == ')')
148  _M_token = _S_token_subexpr_end;
149  else if (__c == '[')
150  {
151  _M_state = _S_state_in_bracket;
152  _M_at_bracket_start = true;
153  if (_M_current != _M_end && *_M_current == '^')
154  {
155  _M_token = _S_token_bracket_neg_begin;
156  ++_M_current;
157  }
158  else
159  _M_token = _S_token_bracket_begin;
160  }
161  else if (__c == '{')
162  {
163  _M_state = _S_state_in_brace;
164  _M_token = _S_token_interval_begin;
165  }
166  else if (((__pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')))
167  != nullptr
168  && *__pos != '\0'
169  && __c != ']'
170  && __c != '}')
171  || (_M_is_grep() && __c == '\n'))
172  {
173  auto __it = _M_token_tbl;
174  auto __narrowc = _M_ctype.narrow(__c, '\0');
175  for (; __it->first != '\0'; ++__it)
176  if (__it->first == __narrowc)
177  {
178  _M_token = __it->second;
179  return;
180  }
181  _GLIBCXX_DEBUG_ASSERT(false);
182  }
183  else
184  {
185  _M_token = _S_token_ord_char;
186  _M_value.assign(1, __c);
187  }
188  }
189 
190  // Differences between styles:
191  // 1) different semantics of "[]" and "[^]".
192  // 2) Escaping in bracket expr.
193  template<typename _CharT>
194  void
195  _Scanner<_CharT>::
196  _M_scan_in_bracket()
197  {
198  if (_M_current == _M_end)
199  __throw_regex_error(regex_constants::error_brack);
200 
201  auto __c = *_M_current++;
202 
203  if (__c == '[')
204  {
205  if (_M_current == _M_end)
206  __throw_regex_error(regex_constants::error_brack);
207 
208  if (*_M_current == '.')
209  {
210  _M_token = _S_token_collsymbol;
211  _M_eat_class(*_M_current++);
212  }
213  else if (*_M_current == ':')
214  {
215  _M_token = _S_token_char_class_name;
216  _M_eat_class(*_M_current++);
217  }
218  else if (*_M_current == '=')
219  {
220  _M_token = _S_token_equiv_class_name;
221  _M_eat_class(*_M_current++);
222  }
223  else
224  {
225  _M_token = _S_token_ord_char;
226  _M_value.assign(1, __c);
227  }
228  }
229  // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted
230  // literally. So "[]]" or "[^]]" is valid regex. See the testcases
231  // `*/empty_range.cc`.
232  else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start))
233  {
234  _M_token = _S_token_bracket_end;
235  _M_state = _S_state_normal;
236  }
237  // ECMAScirpt and awk permmits escaping in bracket.
238  else if (__c == '\\' && (_M_is_ecma() || _M_is_awk()))
239  (this->*_M_eat_escape)();
240  else
241  {
242  _M_token = _S_token_ord_char;
243  _M_value.assign(1, __c);
244  }
245  _M_at_bracket_start = false;
246  }
247 
248  // Differences between styles:
249  // 1) "\}" in basic style.
250  template<typename _CharT>
251  void
252  _Scanner<_CharT>::
253  _M_scan_in_brace()
254  {
255  if (_M_current == _M_end)
256  __throw_regex_error(regex_constants::error_brace);
257 
258  auto __c = *_M_current++;
259 
260  if (_M_ctype.is(_CtypeT::digit, __c))
261  {
262  _M_token = _S_token_dup_count;
263  _M_value.assign(1, __c);
264  while (_M_current != _M_end
265  && _M_ctype.is(_CtypeT::digit, *_M_current))
266  _M_value += *_M_current++;
267  }
268  else if (__c == ',')
269  _M_token = _S_token_comma;
270  // basic use \}.
271  else if (_M_is_basic())
272  {
273  if (__c == '\\' && _M_current != _M_end && *_M_current == '}')
274  {
275  _M_state = _S_state_normal;
276  _M_token = _S_token_interval_end;
277  ++_M_current;
278  }
279  else
280  __throw_regex_error(regex_constants::error_badbrace);
281  }
282  else if (__c == '}')
283  {
284  _M_state = _S_state_normal;
285  _M_token = _S_token_interval_end;
286  }
287  else
288  __throw_regex_error(regex_constants::error_badbrace);
289  }
290 
291  template<typename _CharT>
292  void
293  _Scanner<_CharT>::
294  _M_eat_escape_ecma()
295  {
296  if (_M_current == _M_end)
297  __throw_regex_error(regex_constants::error_escape);
298 
299  auto __c = *_M_current++;
300  auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
301 
302  if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket))
303  {
304  _M_token = _S_token_ord_char;
305  _M_value.assign(1, *__pos);
306  }
307  else if (__c == 'b')
308  {
309  _M_token = _S_token_word_bound;
310  _M_value.assign(1, 'p');
311  }
312  else if (__c == 'B')
313  {
314  _M_token = _S_token_word_bound;
315  _M_value.assign(1, 'n');
316  }
317  // N3376 28.13
318  else if (__c == 'd'
319  || __c == 'D'
320  || __c == 's'
321  || __c == 'S'
322  || __c == 'w'
323  || __c == 'W')
324  {
325  _M_token = _S_token_quoted_class;
326  _M_value.assign(1, __c);
327  }
328  else if (__c == 'c')
329  {
330  if (_M_current == _M_end)
331  __throw_regex_error(regex_constants::error_escape);
332  _M_token = _S_token_ord_char;
333  _M_value.assign(1, *_M_current++);
334  }
335  else if (__c == 'x' || __c == 'u')
336  {
337  _M_value.erase();
338  for (int i = 0; i < (__c == 'x' ? 2 : 4); i++)
339  {
340  if (_M_current == _M_end
341  || !_M_ctype.is(_CtypeT::xdigit, *_M_current))
342  __throw_regex_error(regex_constants::error_escape);
343  _M_value += *_M_current++;
344  }
345  _M_token = _S_token_hex_num;
346  }
347  // ECMAScript recongnizes multi-digit back-references.
348  else if (_M_ctype.is(_CtypeT::digit, __c))
349  {
350  _M_value.assign(1, __c);
351  while (_M_current != _M_end
352  && _M_ctype.is(_CtypeT::digit, *_M_current))
353  _M_value += *_M_current++;
354  _M_token = _S_token_backref;
355  }
356  else
357  {
358  _M_token = _S_token_ord_char;
359  _M_value.assign(1, __c);
360  }
361  }
362 
363  // Differences between styles:
364  // 1) Extended doesn't support backref, but basic does.
365  template<typename _CharT>
366  void
367  _Scanner<_CharT>::
368  _M_eat_escape_posix()
369  {
370  if (_M_current == _M_end)
371  __throw_regex_error(regex_constants::error_escape);
372 
373  auto __c = *_M_current;
374  auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'));
375 
376  if (__pos != nullptr && *__pos != '\0')
377  {
378  _M_token = _S_token_ord_char;
379  _M_value.assign(1, __c);
380  }
381  // We MUST judge awk before handling backrefs. There's no backref in awk.
382  else if (_M_is_awk())
383  {
384  _M_eat_escape_awk();
385  return;
386  }
387  else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0')
388  {
389  _M_token = _S_token_backref;
390  _M_value.assign(1, __c);
391  }
392  else
393  {
394 #ifdef __STRICT_ANSI__
395  __throw_regex_error(regex_constants::error_escape);
396 #else
397  _M_token = _S_token_ord_char;
398  _M_value.assign(1, __c);
399 #endif
400  }
401  ++_M_current;
402  }
403 
404  template<typename _CharT>
405  void
406  _Scanner<_CharT>::
407  _M_eat_escape_awk()
408  {
409  auto __c = *_M_current++;
410  auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
411 
412  if (__pos != nullptr)
413  {
414  _M_token = _S_token_ord_char;
415  _M_value.assign(1, *__pos);
416  }
417  // \ddd for oct representation
418  else if (_M_ctype.is(_CtypeT::digit, __c)
419  && __c != '8'
420  && __c != '9')
421  {
422  _M_value.assign(1, __c);
423  for (int __i = 0;
424  __i < 2
425  && _M_current != _M_end
426  && _M_ctype.is(_CtypeT::digit, *_M_current)
427  && *_M_current != '8'
428  && *_M_current != '9';
429  __i++)
430  _M_value += *_M_current++;
431  _M_token = _S_token_oct_num;
432  return;
433  }
434  else
435  __throw_regex_error(regex_constants::error_escape);
436  }
437 
438  // Eats a character class or throwns an exception.
439  // __ch cound be ':', '.' or '=', _M_current is the char after ']' when
440  // returning.
441  template<typename _CharT>
442  void
443  _Scanner<_CharT>::
444  _M_eat_class(char __ch)
445  {
446  for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;)
447  _M_value += *_M_current++;
448  if (_M_current == _M_end
449  || *_M_current++ != __ch
450  || _M_current == _M_end // skip __ch
451  || *_M_current++ != ']') // skip ']'
452  {
453  if (__ch == ':')
454  __throw_regex_error(regex_constants::error_ctype);
455  else
456  __throw_regex_error(regex_constants::error_collate);
457  }
458  }
459 
#ifdef _GLIBCXX_DEBUG
// Debug-only helper: writes a one-line description of the current token
// (including _M_value where the token carries one) to ostr and returns
// ostr.
template<typename _CharT>
  std::ostream&
  _Scanner<_CharT>::
  _M_print(std::ostream& ostr)
  {
    switch (_M_token)
      {
      case _S_token_anychar:
        ostr << "any-character\n";
        break;
      case _S_token_backref:
        ostr << "backref\n";
        break;
      case _S_token_bracket_begin:
        ostr << "bracket-begin\n";
        break;
      case _S_token_bracket_neg_begin:
        ostr << "bracket-neg-begin\n";
        break;
      case _S_token_bracket_end:
        ostr << "bracket-end\n";
        break;
      case _S_token_char_class_name:
        ostr << "char-class-name \"" << _M_value << "\"\n";
        break;
      case _S_token_closure0:
        ostr << "closure0\n";
        break;
      case _S_token_closure1:
        ostr << "closure1\n";
        break;
      case _S_token_collsymbol:
        ostr << "collsymbol \"" << _M_value << "\"\n";
        break;
      case _S_token_comma:
        ostr << "comma\n";
        break;
      case _S_token_dup_count:
        ostr << "dup count: " << _M_value << "\n";
        break;
      case _S_token_eof:
        ostr << "EOF\n";
        break;
      case _S_token_equiv_class_name:
        ostr << "equiv-class-name \"" << _M_value << "\"\n";
        break;
      case _S_token_interval_begin:
        ostr << "interval begin\n";
        break;
      case _S_token_interval_end:
        ostr << "interval end\n";
        break;
      case _S_token_line_begin:
        ostr << "line begin\n";
        break;
      case _S_token_line_end:
        ostr << "line end\n";
        break;
      case _S_token_opt:
        ostr << "opt\n";
        break;
      case _S_token_or:
        ostr << "or\n";
        break;
      case _S_token_ord_char:
        ostr << "ordinary character: \"" << _M_value << "\"\n";
        break;
      case _S_token_subexpr_begin:
        ostr << "subexpr begin\n";
        break;
      case _S_token_subexpr_no_group_begin:
        ostr << "no grouping subexpr begin\n";
        break;
      case _S_token_subexpr_lookahead_begin:
        ostr << "lookahead subexpr begin\n";
        break;
      case _S_token_subexpr_end:
        ostr << "subexpr end\n";
        break;
      case _S_token_unknown:
        ostr << "-- unknown token --\n";
        break;
      case _S_token_oct_num:
        ostr << "oct number " << _M_value << "\n";
        break;
      case _S_token_hex_num:
        ostr << "hex number " << _M_value << "\n";
        break;
      case _S_token_quoted_class:
        ostr << "quoted class " << "\\" << _M_value << "\n";
        break;
      // Produced by the ECMAScript escape handler for "\b" and "\B";
      // without this case such a token would hit the assertion below.
      case _S_token_word_bound:
        ostr << "word boundary\n";
        break;
      default:
        _GLIBCXX_DEBUG_ASSERT(false);
      }
    return ostr;
  }
#endif
558 
559 _GLIBCXX_END_NAMESPACE_VERSION
560 } // namespace __detail
561 } // namespace