libstdc++
regex.tcc
1 // class template regex -*- C++ -*-
2 
3 // Copyright (C) 2013-2014 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /**
26  * @file bits/regex.tcc
27  * This is an internal header file, included by other library headers.
28  * Do not attempt to use it directly. @headername{regex}
29  */
30 
31 // See below __regex_algo_impl to get what this is talking about. The default
32 // value 1 indicated a conservative optimization without giving up worst case
33 // performance.
34 #ifndef _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
35 #define _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT 1
36 #endif
37 
38 namespace std _GLIBCXX_VISIBILITY(default)
39 {
40 namespace __detail
41 {
42 _GLIBCXX_BEGIN_NAMESPACE_VERSION
43 
44  // Result of merging regex_match and regex_search.
45  //
46  // __policy now can be _S_auto (auto dispatch) and _S_alternate (use
47  // the other one if possible, for test purpose).
48  //
49  // That __match_mode is true means regex_match, else regex_search.
50  template<typename _BiIter, typename _Alloc,
51  typename _CharT, typename _TraitsT,
52  _RegexExecutorPolicy __policy,
53  bool __match_mode>
54  bool
55  __regex_algo_impl(_BiIter __s,
56  _BiIter __e,
57  match_results<_BiIter, _Alloc>& __m,
58  const basic_regex<_CharT, _TraitsT>& __re,
59  regex_constants::match_flag_type __flags)
60  {
61  if (__re._M_automaton == nullptr)
62  return false;
63 
64  typename match_results<_BiIter, _Alloc>::_Base_type& __res = __m;
65  __m._M_begin = __s;
66  __res.resize(__re._M_automaton->_M_sub_count() + 2);
67  for (auto& __it : __res)
68  __it.matched = false;
69 
70  // This function decide which executor to use under given circumstances.
71  // The _S_auto policy now is the following: if a NFA has no
72  // back-references and has more than _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
73  // quantifiers (*, +, ?), the BFS executor will be used, other wise
74  // DFS executor. This is because DFS executor has a exponential upper
75  // bound, but better best-case performace. Meanwhile, BFS executor can
76  // effectively prevent from exponential-long time matching (which must
77  // contains many quantifiers), but it's slower in average.
78  //
79  // For simple regex, BFS executor could be 2 or more times slower than
80  // DFS executor.
81  //
82  // Of course, BFS executor cannot handle back-references.
83  bool __ret;
84  if (!__re._M_automaton->_M_has_backref
85  && (__policy == _RegexExecutorPolicy::_S_alternate
86  || __re._M_automaton->_M_quant_count
87  > _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT))
88  {
89  _Executor<_BiIter, _Alloc, _TraitsT, false>
90  __executor(__s, __e, __m, __re, __flags);
91  if (__match_mode)
92  __ret = __executor._M_match();
93  else
94  __ret = __executor._M_search();
95  }
96  else
97  {
98  _Executor<_BiIter, _Alloc, _TraitsT, true>
99  __executor(__s, __e, __m, __re, __flags);
100  if (__match_mode)
101  __ret = __executor._M_match();
102  else
103  __ret = __executor._M_search();
104  }
105  if (__ret)
106  {
107  for (auto __it : __res)
108  if (!__it.matched)
109  __it.first = __it.second = __e;
110  auto& __pre = __res[__res.size()-2];
111  auto& __suf = __res[__res.size()-1];
112  if (__match_mode)
113  {
114  __pre.matched = false;
115  __pre.first = __s;
116  __pre.second = __s;
117  __suf.matched = false;
118  __suf.first = __e;
119  __suf.second = __e;
120  }
121  else
122  {
123  __pre.first = __s;
124  __pre.second = __res[0].first;
125  __pre.matched = (__pre.first != __pre.second);
126  __suf.first = __res[0].second;
127  __suf.second = __e;
128  __suf.matched = (__suf.first != __suf.second);
129  }
130  }
131  return __ret;
132  }
133 
134 _GLIBCXX_END_NAMESPACE_VERSION
135 }
136 
137 _GLIBCXX_BEGIN_NAMESPACE_VERSION
138 
139  template<typename _Ch_type>
140  template<typename _Fwd_iter>
141  typename regex_traits<_Ch_type>::string_type
142  regex_traits<_Ch_type>::
143  lookup_collatename(_Fwd_iter __first, _Fwd_iter __last) const
144  {
145  typedef std::ctype<char_type> __ctype_type;
146  const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
147 
148  static const char* __collatenames[] =
149  {
150  "NUL",
151  "SOH",
152  "STX",
153  "ETX",
154  "EOT",
155  "ENQ",
156  "ACK",
157  "alert",
158  "backspace",
159  "tab",
160  "newline",
161  "vertical-tab",
162  "form-feed",
163  "carriage-return",
164  "SO",
165  "SI",
166  "DLE",
167  "DC1",
168  "DC2",
169  "DC3",
170  "DC4",
171  "NAK",
172  "SYN",
173  "ETB",
174  "CAN",
175  "EM",
176  "SUB",
177  "ESC",
178  "IS4",
179  "IS3",
180  "IS2",
181  "IS1",
182  "space",
183  "exclamation-mark",
184  "quotation-mark",
185  "number-sign",
186  "dollar-sign",
187  "percent-sign",
188  "ampersand",
189  "apostrophe",
190  "left-parenthesis",
191  "right-parenthesis",
192  "asterisk",
193  "plus-sign",
194  "comma",
195  "hyphen",
196  "period",
197  "slash",
198  "zero",
199  "one",
200  "two",
201  "three",
202  "four",
203  "five",
204  "six",
205  "seven",
206  "eight",
207  "nine",
208  "colon",
209  "semicolon",
210  "less-than-sign",
211  "equals-sign",
212  "greater-than-sign",
213  "question-mark",
214  "commercial-at",
215  "A",
216  "B",
217  "C",
218  "D",
219  "E",
220  "F",
221  "G",
222  "H",
223  "I",
224  "J",
225  "K",
226  "L",
227  "M",
228  "N",
229  "O",
230  "P",
231  "Q",
232  "R",
233  "S",
234  "T",
235  "U",
236  "V",
237  "W",
238  "X",
239  "Y",
240  "Z",
241  "left-square-bracket",
242  "backslash",
243  "right-square-bracket",
244  "circumflex",
245  "underscore",
246  "grave-accent",
247  "a",
248  "b",
249  "c",
250  "d",
251  "e",
252  "f",
253  "g",
254  "h",
255  "i",
256  "j",
257  "k",
258  "l",
259  "m",
260  "n",
261  "o",
262  "p",
263  "q",
264  "r",
265  "s",
266  "t",
267  "u",
268  "v",
269  "w",
270  "x",
271  "y",
272  "z",
273  "left-curly-bracket",
274  "vertical-line",
275  "right-curly-bracket",
276  "tilde",
277  "DEL",
278  ""
279  };
280 
281  // same as boost
282  //static const char* __digraphs[] =
283  // {
284  // "ae",
285  // "Ae",
286  // "AE",
287  // "ch",
288  // "Ch",
289  // "CH",
290  // "ll",
291  // "Ll",
292  // "LL",
293  // "ss",
294  // "Ss",
295  // "SS",
296  // "nj",
297  // "Nj",
298  // "NJ",
299  // "dz",
300  // "Dz",
301  // "DZ",
302  // "lj",
303  // "Lj",
304  // "LJ",
305  // ""
306  // };
307 
308  std::string __s(__last - __first, '?');
309  __fctyp.narrow(__first, __last, '?', &*__s.begin());
310 
311  for (unsigned int __i = 0; *__collatenames[__i]; __i++)
312  if (__s == __collatenames[__i])
313  return string_type(1, __fctyp.widen(static_cast<char>(__i)));
314 
315  //for (unsigned int __i = 0; *__digraphs[__i]; __i++)
316  // {
317  // const char* __now = __digraphs[__i];
318  // if (__s == __now)
319  // {
320  // string_type ret(__s.size(), __fctyp.widen('?'));
321  // __fctyp.widen(__now, __now + 2/* ouch */, &*ret.begin());
322  // return ret;
323  // }
324  // }
325  return string_type();
326  }
327 
328  template<typename _Ch_type>
329  template<typename _Fwd_iter>
330  typename regex_traits<_Ch_type>::char_class_type
331  regex_traits<_Ch_type>::
332  lookup_classname(_Fwd_iter __first, _Fwd_iter __last, bool __icase) const
333  {
334  typedef std::ctype<char_type> __ctype_type;
335  typedef std::ctype<char> __cctype_type;
336  typedef const pair<const char*, char_class_type> _ClassnameEntry;
337  const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
338  const __cctype_type& __cctyp(use_facet<__cctype_type>(_M_locale));
339 
340  static _ClassnameEntry __classnames[] =
341  {
342  {"d", ctype_base::digit},
343  {"w", {ctype_base::alnum, _RegexMask::_S_under}},
344  {"s", ctype_base::space},
345  {"alnum", ctype_base::alnum},
346  {"alpha", ctype_base::alpha},
347  {"blank", {0, _RegexMask::_S_blank}},
348  {"cntrl", ctype_base::cntrl},
349  {"digit", ctype_base::digit},
350  {"graph", ctype_base::graph},
351  {"lower", ctype_base::lower},
352  {"print", ctype_base::print},
353  {"punct", ctype_base::punct},
354  {"space", ctype_base::space},
355  {"upper", ctype_base::upper},
356  {"xdigit", ctype_base::xdigit},
357  };
358 
359  std::string __s(__last - __first, '?');
360  __fctyp.narrow(__first, __last, '?', &__s[0]);
361  __cctyp.tolower(&*__s.begin(), &*__s.begin() + __s.size());
362  for (_ClassnameEntry* __it = __classnames;
363  __it < *(&__classnames + 1);
364  ++__it)
365  {
366  if (__s == __it->first)
367  {
368  if (__icase
369  && ((__it->second
370  & (ctype_base::lower | ctype_base::upper)) != 0))
371  return ctype_base::alpha;
372  return __it->second;
373  }
374  }
375  return 0;
376  }
377 
378  template<typename _Ch_type>
379  bool
380  regex_traits<_Ch_type>::
381  isctype(_Ch_type __c, char_class_type __f) const
382  {
383  typedef std::ctype<char_type> __ctype_type;
384  const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
385 
386  return __fctyp.is(__f._M_base, __c)
387  // [[:w:]]
388  || ((__f._M_extended & _RegexMask::_S_under)
389  && __c == __fctyp.widen('_'))
390  // [[:blank:]]
391  || ((__f._M_extended & _RegexMask::_S_blank)
392  && (__c == __fctyp.widen(' ')
393  || __c == __fctyp.widen('\t')));
394  }
395 
396  template<typename _Ch_type>
397  int
398  regex_traits<_Ch_type>::
399  value(_Ch_type __ch, int __radix) const
400  {
401  std::basic_istringstream<char_type> __is(string_type(1, __ch));
402  long __v;
403  if (__radix == 8)
404  __is >> std::oct;
405  else if (__radix == 16)
406  __is >> std::hex;
407  __is >> __v;
408  return __is.fail() ? -1 : __v;
409  }
410 
411  template<typename _Bi_iter, typename _Alloc>
412  template<typename _Out_iter>
413  _Out_iter match_results<_Bi_iter, _Alloc>::
414  format(_Out_iter __out,
415  const match_results<_Bi_iter, _Alloc>::char_type* __fmt_first,
416  const match_results<_Bi_iter, _Alloc>::char_type* __fmt_last,
417  match_flag_type __flags) const
418  {
419  _GLIBCXX_DEBUG_ASSERT( ready() );
420  regex_traits<char_type> __traits;
421  typedef std::ctype<char_type> __ctype_type;
422  const __ctype_type&
423  __fctyp(use_facet<__ctype_type>(__traits.getloc()));
424 
425  auto __output = [&](size_t __idx)
426  {
427  auto& __sub = _Base_type::operator[](__idx);
428  if (__sub.matched)
429  __out = std::copy(__sub.first, __sub.second, __out);
430  };
431 
432  if (__flags & regex_constants::format_sed)
433  {
434  for (; __fmt_first != __fmt_last;)
435  if (*__fmt_first == '&')
436  {
437  __output(0);
438  ++__fmt_first;
439  }
440  else if (*__fmt_first == '\\')
441  {
442  if (++__fmt_first != __fmt_last
443  && __fctyp.is(__ctype_type::digit, *__fmt_first))
444  __output(__traits.value(*__fmt_first++, 10));
445  else
446  *__out++ = '\\';
447  }
448  else
449  *__out++ = *__fmt_first++;
450  }
451  else
452  {
453  while (1)
454  {
455  auto __next = std::find(__fmt_first, __fmt_last, '$');
456  if (__next == __fmt_last)
457  break;
458 
459  __out = std::copy(__fmt_first, __next, __out);
460 
461  auto __eat = [&](char __ch) -> bool
462  {
463  if (*__next == __ch)
464  {
465  ++__next;
466  return true;
467  }
468  return false;
469  };
470 
471  if (++__next == __fmt_last)
472  *__out++ = '$';
473  else if (__eat('$'))
474  *__out++ = '$';
475  else if (__eat('&'))
476  __output(0);
477  else if (__eat('`'))
478  __output(_Base_type::size()-2);
479  else if (__eat('\''))
480  __output(_Base_type::size()-1);
481  else if (__fctyp.is(__ctype_type::digit, *__next))
482  {
483  long __num = __traits.value(*__next, 10);
484  if (++__next != __fmt_last
485  && __fctyp.is(__ctype_type::digit, *__next))
486  {
487  __num *= 10;
488  __num += __traits.value(*__next++, 10);
489  }
490  if (0 <= __num && __num < this->size())
491  __output(__num);
492  }
493  else
494  *__out++ = '$';
495  __fmt_first = __next;
496  }
497  __out = std::copy(__fmt_first, __fmt_last, __out);
498  }
499  return __out;
500  }
501 
502  template<typename _Out_iter, typename _Bi_iter,
503  typename _Rx_traits, typename _Ch_type>
504  _Out_iter
505  regex_replace(_Out_iter __out, _Bi_iter __first, _Bi_iter __last,
506  const basic_regex<_Ch_type, _Rx_traits>& __e,
507  const _Ch_type* __fmt,
508  regex_constants::match_flag_type __flags)
509  {
510  typedef regex_iterator<_Bi_iter, _Ch_type, _Rx_traits> _IterT;
511  _IterT __i(__first, __last, __e, __flags);
512  _IterT __end;
513  if (__i == __end)
514  {
515  if (!(__flags & regex_constants::format_no_copy))
516  __out = std::copy(__first, __last, __out);
517  }
518  else
519  {
520  sub_match<_Bi_iter> __last;
521  auto __len = char_traits<_Ch_type>::length(__fmt);
522  for (; __i != __end; ++__i)
523  {
524  if (!(__flags & regex_constants::format_no_copy))
525  __out = std::copy(__i->prefix().first, __i->prefix().second,
526  __out);
527  __out = __i->format(__out, __fmt, __fmt + __len, __flags);
528  __last = __i->suffix();
529  if (__flags & regex_constants::format_first_only)
530  break;
531  }
532  if (!(__flags & regex_constants::format_no_copy))
533  __out = std::copy(__last.first, __last.second, __out);
534  }
535  return __out;
536  }
537 
538  template<typename _Bi_iter,
539  typename _Ch_type,
540  typename _Rx_traits>
541  bool
542  regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
543  operator==(const regex_iterator& __rhs) const
544  {
545  return (_M_match.empty() && __rhs._M_match.empty())
546  || (_M_begin == __rhs._M_begin
547  && _M_end == __rhs._M_end
548  && _M_pregex == __rhs._M_pregex
549  && _M_flags == __rhs._M_flags
550  && _M_match[0] == __rhs._M_match[0]);
551  }
552 
553  template<typename _Bi_iter,
554  typename _Ch_type,
555  typename _Rx_traits>
556  regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
557  regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
558  operator++()
559  {
560  // In all cases in which the call to regex_search returns true,
561  // match.prefix().first shall be equal to the previous value of
562  // match[0].second, and for each index i in the half-open range
563  // [0, match.size()) for which match[i].matched is true,
564  // match[i].position() shall return distance(begin, match[i].first).
565  // [28.12.1.4.5]
566  if (_M_match[0].matched)
567  {
568  auto __start = _M_match[0].second;
569  auto __prefix_first = _M_match[0].second;
570  if (_M_match[0].first == _M_match[0].second)
571  {
572  if (__start == _M_end)
573  {
574  _M_match = value_type();
575  return *this;
576  }
577  else
578  {
579  if (regex_search(__start, _M_end, _M_match, *_M_pregex,
580  _M_flags
581  | regex_constants::match_not_null
582  | regex_constants::match_continuous))
583  {
584  _GLIBCXX_DEBUG_ASSERT(_M_match[0].matched);
585  auto& __prefix = _M_match.at(_M_match.size());
586  __prefix.first = __prefix_first;
587  __prefix.matched = __prefix.first != __prefix.second;
588  // [28.12.1.4.5]
589  _M_match._M_begin = _M_begin;
590  return *this;
591  }
592  else
593  ++__start;
594  }
595  }
596  _M_flags |= regex_constants::match_prev_avail;
597  if (regex_search(__start, _M_end, _M_match, *_M_pregex, _M_flags))
598  {
599  _GLIBCXX_DEBUG_ASSERT(_M_match[0].matched);
600  auto& __prefix = _M_match.at(_M_match.size());
601  __prefix.first = __prefix_first;
602  __prefix.matched = __prefix.first != __prefix.second;
603  // [28.12.1.4.5]
604  _M_match._M_begin = _M_begin;
605  }
606  else
607  _M_match = value_type();
608  }
609  return *this;
610  }
611 
612  template<typename _Bi_iter,
613  typename _Ch_type,
614  typename _Rx_traits>
615  regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
616  regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
617  operator=(const regex_token_iterator& __rhs)
618  {
619  _M_position = __rhs._M_position;
620  _M_subs = __rhs._M_subs;
621  _M_n = __rhs._M_n;
622  _M_suffix = __rhs._M_suffix;
623  _M_has_m1 = __rhs._M_has_m1;
624  _M_normalize_result();
625  return *this;
626  }
627 
628  template<typename _Bi_iter,
629  typename _Ch_type,
630  typename _Rx_traits>
631  bool
632  regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
633  operator==(const regex_token_iterator& __rhs) const
634  {
635  if (_M_end_of_seq() && __rhs._M_end_of_seq())
636  return true;
637  if (_M_suffix.matched && __rhs._M_suffix.matched
638  && _M_suffix == __rhs._M_suffix)
639  return true;
640  if (_M_end_of_seq() || _M_suffix.matched
641  || __rhs._M_end_of_seq() || __rhs._M_suffix.matched)
642  return false;
643  return _M_position == __rhs._M_position
644  && _M_n == __rhs._M_n
645  && _M_subs == __rhs._M_subs;
646  }
647 
648  template<typename _Bi_iter,
649  typename _Ch_type,
650  typename _Rx_traits>
651  regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
652  regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
653  operator++()
654  {
655  _Position __prev = _M_position;
656  if (_M_suffix.matched)
657  *this = regex_token_iterator();
658  else if (_M_n + 1 < _M_subs.size())
659  {
660  _M_n++;
661  _M_result = &_M_current_match();
662  }
663  else
664  {
665  _M_n = 0;
666  ++_M_position;
667  if (_M_position != _Position())
668  _M_result = &_M_current_match();
669  else if (_M_has_m1 && __prev->suffix().length() != 0)
670  {
671  _M_suffix.matched = true;
672  _M_suffix.first = __prev->suffix().first;
673  _M_suffix.second = __prev->suffix().second;
674  _M_result = &_M_suffix;
675  }
676  else
677  *this = regex_token_iterator();
678  }
679  return *this;
680  }
681 
682  template<typename _Bi_iter,
683  typename _Ch_type,
684  typename _Rx_traits>
685  void
686  regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
687  _M_init(_Bi_iter __a, _Bi_iter __b)
688  {
689  _M_has_m1 = false;
690  for (auto __it : _M_subs)
691  if (__it == -1)
692  {
693  _M_has_m1 = true;
694  break;
695  }
696  if (_M_position != _Position())
697  _M_result = &_M_current_match();
698  else if (_M_has_m1)
699  {
700  _M_suffix.matched = true;
701  _M_suffix.first = __a;
702  _M_suffix.second = __b;
703  _M_result = &_M_suffix;
704  }
705  else
706  _M_result = nullptr;
707  }
708 
709 _GLIBCXX_END_NAMESPACE_VERSION
710 } // namespace
711