1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
21 /* This is currently duplicated from git-compat-utils.h */
23 typedef long intptr_t;
24 typedef unsigned long uintptr_t;
28 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
29 size_t length, reg_syntax_t syntax);
30 static void re_compile_fastmap_iter (regex_t *bufp,
31 const re_dfastate_t *init_state,
33 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
35 static void free_charset (re_charset_t *cset);
36 #endif /* RE_ENABLE_I18N */
37 static void free_workarea_compile (regex_t *preg);
38 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
40 static void optimize_utf8 (re_dfa_t *dfa);
42 static reg_errcode_t analyze (regex_t *preg);
43 static reg_errcode_t preorder (bin_tree_t *root,
44 reg_errcode_t (fn (void *, bin_tree_t *)),
46 static reg_errcode_t postorder (bin_tree_t *root,
47 reg_errcode_t (fn (void *, bin_tree_t *)),
49 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
50 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
51 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
53 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
54 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
55 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
56 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
57 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
58 unsigned int constraint);
59 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
60 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
62 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
63 static int fetch_number (re_string_t *input, re_token_t *token,
65 static int peek_token (re_token_t *token, re_string_t *input,
66 reg_syntax_t syntax) internal_function;
67 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
68 reg_syntax_t syntax, reg_errcode_t *err);
69 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
70 re_token_t *token, reg_syntax_t syntax,
71 int nest, reg_errcode_t *err);
72 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
73 re_token_t *token, reg_syntax_t syntax,
74 int nest, reg_errcode_t *err);
75 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
76 re_token_t *token, reg_syntax_t syntax,
77 int nest, reg_errcode_t *err);
78 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
79 re_token_t *token, reg_syntax_t syntax,
80 int nest, reg_errcode_t *err);
81 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
82 re_dfa_t *dfa, re_token_t *token,
83 reg_syntax_t syntax, reg_errcode_t *err);
84 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
85 re_token_t *token, reg_syntax_t syntax,
87 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
89 re_token_t *token, int token_len,
93 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
97 static reg_errcode_t build_equiv_class (bitset_t sbcset,
99 int *equiv_class_alloc,
100 const unsigned char *name);
101 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103 re_charset_t *mbcset,
104 int *char_class_alloc,
105 const char *class_name,
106 reg_syntax_t syntax);
107 #else /* not RE_ENABLE_I18N */
108 static reg_errcode_t build_equiv_class (bitset_t sbcset,
109 const unsigned char *name);
110 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
112 const char *class_name,
113 reg_syntax_t syntax);
114 #endif /* not RE_ENABLE_I18N */
115 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
116 RE_TRANSLATE_TYPE trans,
117 const char *class_name,
119 int non_match, reg_errcode_t *err);
120 static bin_tree_t *create_tree (re_dfa_t *dfa,
121 bin_tree_t *left, bin_tree_t *right,
122 re_token_type_t type);
123 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
124 bin_tree_t *left, bin_tree_t *right,
125 const re_token_t *token);
126 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
127 static void free_token (re_token_t *node);
128 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
129 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
131 /* This table gives an error message for each of the error codes listed
132 in regex.h. Obviously the order here has to be same as there.
133 POSIX doesn't require that we do anything for REG_NOERROR,
134 but why not be nice? */
136 const char __re_error_msgid[] attribute_hidden =
138 #define REG_NOERROR_IDX 0
139 gettext_noop ("Success") /* REG_NOERROR */
141 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
142 gettext_noop ("No match") /* REG_NOMATCH */
144 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
145 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
147 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
148 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
150 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
151 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
153 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
154 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
156 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
157 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
159 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
160 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
162 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
163 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
165 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
166 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
168 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
169 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
171 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
172 gettext_noop ("Invalid range end") /* REG_ERANGE */
174 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
175 gettext_noop ("Memory exhausted") /* REG_ESPACE */
177 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
178 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
180 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
181 gettext_noop ("Premature end of regular expression") /* REG_EEND */
183 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
184 gettext_noop ("Regular expression too big") /* REG_ESIZE */
186 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
187 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
190 const size_t __re_error_msgid_idx[] attribute_hidden =
211 /* Entry points for GNU code. */
216 /* For ZOS USS we must define btowc */
227 mbtowc (wtmp, tmp, 1);
232 /* re_compile_pattern is the GNU regular expression compiler: it
233 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
234 Returns 0 if the pattern was valid, otherwise an error string.
236 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
237 are set in BUFP on entry. */
240 re_compile_pattern (const char *pattern,
242 struct re_pattern_buffer *bufp)
246 /* And GNU code determines whether or not to get register information
247 by passing null for the REGS argument to re_match, etc., not by
248 setting no_sub, unless RE_NO_SUB is set. */
249 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
251 /* Match anchors at newline. */
252 bufp->newline_anchor = 1;
254 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
258 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
261 weak_alias (__re_compile_pattern, re_compile_pattern)
264 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
265 also be assigned to arbitrarily: each pattern buffer stores its own
266 syntax, so it can be changed between regex compilations. */
267 /* This has no initializer because initialized variables in Emacs
268 become read-only after dumping. */
269 reg_syntax_t re_syntax_options;
272 /* Specify the precise syntax of regexps for compilation. This provides
273 for compatibility for various utilities which historically have
274 different, incompatible syntaxes.
276 The argument SYNTAX is a bit mask comprised of the various bits
277 defined in regex.h. We return the old syntax. */
280 re_set_syntax (reg_syntax_t syntax)
282 reg_syntax_t ret = re_syntax_options;
284 re_syntax_options = syntax;
288 weak_alias (__re_set_syntax, re_set_syntax)
292 re_compile_fastmap (struct re_pattern_buffer *bufp)
294 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
295 char *fastmap = bufp->fastmap;
297 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
298 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
299 if (dfa->init_state != dfa->init_state_word)
300 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
301 if (dfa->init_state != dfa->init_state_nl)
302 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
303 if (dfa->init_state != dfa->init_state_begbuf)
304 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
305 bufp->fastmap_accurate = 1;
309 weak_alias (__re_compile_fastmap, re_compile_fastmap)
313 __attribute ((always_inline))
314 re_set_fastmap (char *fastmap, int icase, int ch)
318 fastmap[tolower (ch)] = 1;
321 /* Helper function for re_compile_fastmap.
322 Compile fastmap for the initial_state INIT_STATE. */
325 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
328 volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
330 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
331 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
333 int node = init_state->nodes.elems[node_cnt];
334 re_token_type_t type = dfa->nodes[node].type;
336 if (type == CHARACTER)
338 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
339 #ifdef RE_ENABLE_I18N
340 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
342 unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
347 *p++ = dfa->nodes[node].opr.c;
348 while (++node < dfa->nodes_len
349 && dfa->nodes[node].type == CHARACTER
350 && dfa->nodes[node].mb_partial)
351 *p++ = dfa->nodes[node].opr.c;
352 memset (&state, '\0', sizeof (state));
353 if (__mbrtowc (&wc, (const char *) buf, p - buf,
355 && (__wcrtomb ((char *) buf, towlower (wc), &state)
357 re_set_fastmap (fastmap, 0, buf[0]);
362 else if (type == SIMPLE_BRACKET)
365 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
368 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
369 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
370 if (w & ((bitset_word_t) 1 << j))
371 re_set_fastmap (fastmap, icase, ch);
374 #ifdef RE_ENABLE_I18N
375 else if (type == COMPLEX_BRACKET)
377 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
381 /* See if we have to try all bytes which start multiple collation
383 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
384 collation element, and don't catch 'b' since 'b' is
385 the only collation element which starts from 'b' (and
386 it is caught by SIMPLE_BRACKET). */
387 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
388 && (cset->ncoll_syms || cset->nranges))
390 const int32_t *table = (const int32_t *)
391 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
392 for (i = 0; i < SBC_MAX; ++i)
394 re_set_fastmap (fastmap, icase, i);
398 /* See if we have to start the match at all multibyte characters,
399 i.e. where we would not find an invalid sequence. This only
400 applies to multibyte character sets; for single byte character
401 sets, the SIMPLE_BRACKET again suffices. */
402 if (dfa->mb_cur_max > 1
403 && (cset->nchar_classes || cset->non_match || cset->nranges
405 || cset->nequiv_classes
413 memset (&mbs, 0, sizeof (mbs));
414 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
415 re_set_fastmap (fastmap, false, (int) c);
422 /* ... Else catch all bytes which can start the mbchars. */
423 for (i = 0; i < cset->nmbchars; ++i)
427 memset (&state, '\0', sizeof (state));
428 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
429 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
430 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
432 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
434 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
439 #endif /* RE_ENABLE_I18N */
440 else if (type == OP_PERIOD
441 #ifdef RE_ENABLE_I18N
442 || type == OP_UTF8_PERIOD
443 #endif /* RE_ENABLE_I18N */
444 || type == END_OF_RE)
446 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
447 if (type == END_OF_RE)
448 bufp->can_be_null = 1;
454 /* Entry point for POSIX code. */
455 /* regcomp takes a regular expression as a string and compiles it.
457 PREG is a regex_t *. We do not expect any fields to be initialized,
458 since POSIX says we shouldn't. Thus, we set
460 `buffer' to the compiled pattern;
461 `used' to the length of the compiled pattern;
462 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
463 REG_EXTENDED bit in CFLAGS is set; otherwise, to
464 RE_SYNTAX_POSIX_BASIC;
465 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
466 `fastmap' to an allocated space for the fastmap;
467 `fastmap_accurate' to zero;
468 `re_nsub' to the number of subexpressions in PATTERN.
470 PATTERN is the address of the pattern string.
472 CFLAGS is a series of bits which affect compilation.
474 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
475 use POSIX basic syntax.
477 If REG_NEWLINE is set, then . and [^...] don't match newline.
478 Also, regexec will try a match beginning after every newline.
480 If REG_ICASE is set, then we considers upper- and lowercase
481 versions of letters to be equivalent when matching.
483 If REG_NOSUB is set, then when PREG is passed to regexec, that
484 routine will report only success or failure, and nothing about the
487 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
488 the return codes and their meanings.) */
491 regcomp (regex_t *__restrict preg,
492 const char *__restrict pattern,
496 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
497 : RE_SYNTAX_POSIX_BASIC);
503 /* Try to allocate space for the fastmap. */
504 preg->fastmap = re_malloc (char, SBC_MAX);
505 if (BE (preg->fastmap == NULL, 0))
508 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
510 /* If REG_NEWLINE is set, newlines are treated differently. */
511 if (cflags & REG_NEWLINE)
512 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
513 syntax &= ~RE_DOT_NEWLINE;
514 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
515 /* It also changes the matching behavior. */
516 preg->newline_anchor = 1;
519 preg->newline_anchor = 0;
520 preg->no_sub = !!(cflags & REG_NOSUB);
521 preg->translate = NULL;
523 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
525 /* POSIX doesn't distinguish between an unmatched open-group and an
526 unmatched close-group: both are REG_EPAREN. */
527 if (ret == REG_ERPAREN)
530 /* We have already checked preg->fastmap != NULL. */
531 if (BE (ret == REG_NOERROR, 1))
532 /* Compute the fastmap now, since regexec cannot modify the pattern
533 buffer. This function never fails in this implementation. */
534 (void) re_compile_fastmap (preg);
537 /* Some error occurred while compiling the expression. */
538 re_free (preg->fastmap);
539 preg->fastmap = NULL;
545 weak_alias (__regcomp, regcomp)
548 /* Returns a message corresponding to an error code, ERRCODE, returned
549 from either regcomp or regexec. We don't use PREG here. */
552 regerror(int errcode, const regex_t *__restrict preg,
553 char *__restrict errbuf, size_t errbuf_size)
559 || errcode >= (int) (sizeof (__re_error_msgid_idx)
560 / sizeof (__re_error_msgid_idx[0])), 0))
561 /* Only error codes returned by the rest of the code should be passed
562 to this routine. If we are given anything else, or if other regex
563 code generates an invalid error code, then the program has a bug.
564 Dump core so we can fix it. */
567 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
569 msg_size = strlen (msg) + 1; /* Includes the null. */
571 if (BE (errbuf_size != 0, 1))
573 if (BE (msg_size > errbuf_size, 0))
575 memcpy (errbuf, msg, errbuf_size - 1);
576 errbuf[errbuf_size - 1] = 0;
579 memcpy (errbuf, msg, msg_size);
585 weak_alias (__regerror, regerror)
589 #ifdef RE_ENABLE_I18N
590 /* This static array is used for the map to single-byte characters when
591 UTF-8 is used. Otherwise we would allocate memory just to initialize
592 it the same all the time. UTF-8 is the preferred encoding so this is
593 a worthwhile optimization. */
595 static const bitset_t utf8_sb_map = {
596 /* Set the first 128 bits. */
597 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
599 #else /* ! (__GNUC__ >= 3) */
600 static bitset_t utf8_sb_map;
601 #endif /* __GNUC__ >= 3 */
602 #endif /* RE_ENABLE_I18N */
606 free_dfa_content (re_dfa_t *dfa)
611 for (i = 0; i < dfa->nodes_len; ++i)
612 free_token (dfa->nodes + i);
613 re_free (dfa->nexts);
614 for (i = 0; i < dfa->nodes_len; ++i)
616 if (dfa->eclosures != NULL)
617 re_node_set_free (dfa->eclosures + i);
618 if (dfa->inveclosures != NULL)
619 re_node_set_free (dfa->inveclosures + i);
620 if (dfa->edests != NULL)
621 re_node_set_free (dfa->edests + i);
623 re_free (dfa->edests);
624 re_free (dfa->eclosures);
625 re_free (dfa->inveclosures);
626 re_free (dfa->nodes);
628 if (dfa->state_table)
629 for (i = 0; i <= dfa->state_hash_mask; ++i)
631 struct re_state_table_entry *entry = dfa->state_table + i;
632 for (j = 0; j < entry->num; ++j)
634 re_dfastate_t *state = entry->array[j];
637 re_free (entry->array);
639 re_free (dfa->state_table);
640 #ifdef RE_ENABLE_I18N
641 if (dfa->sb_char != utf8_sb_map)
642 re_free (dfa->sb_char);
644 re_free (dfa->subexp_map);
646 re_free (dfa->re_str);
653 /* Free dynamically allocated space used by PREG. */
656 regfree (regex_t *preg)
658 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
659 if (BE (dfa != NULL, 1))
660 free_dfa_content (dfa);
664 re_free (preg->fastmap);
665 preg->fastmap = NULL;
667 re_free (preg->translate);
668 preg->translate = NULL;
671 weak_alias (__regfree, regfree)
674 /* Entry points compatible with 4.2 BSD regex library. We don't define
675 them unless specifically requested. */
677 #if defined _REGEX_RE_COMP || defined _LIBC
679 /* BSD has one and only one pattern buffer. */
680 static struct re_pattern_buffer re_comp_buf;
684 /* Make these definitions weak in libc, so POSIX programs can redefine
685 these names if they don't use our functions, and still use
686 regcomp/regexec above without link errors. */
697 if (!re_comp_buf.buffer)
698 return gettext ("No previous regular expression");
702 if (re_comp_buf.buffer)
704 fastmap = re_comp_buf.fastmap;
705 re_comp_buf.fastmap = NULL;
706 __regfree (&re_comp_buf);
707 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
708 re_comp_buf.fastmap = fastmap;
711 if (re_comp_buf.fastmap == NULL)
713 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
714 if (re_comp_buf.fastmap == NULL)
715 return (char *) gettext (__re_error_msgid
716 + __re_error_msgid_idx[(int) REG_ESPACE]);
719 /* Since `re_exec' always passes NULL for the `regs' argument, we
720 don't need to initialize the pattern buffer fields which affect it. */
722 /* Match anchors at newlines. */
723 re_comp_buf.newline_anchor = 1;
725 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
730 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
731 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
735 libc_freeres_fn (free_mem)
737 __regfree (&re_comp_buf);
741 #endif /* _REGEX_RE_COMP */
743 /* Internal entry point.
744 Compile the regular expression PATTERN, whose length is LENGTH.
745 SYNTAX indicate regular expression's syntax. */
748 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
751 reg_errcode_t err = REG_NOERROR;
755 /* Initialize the pattern buffer. */
756 preg->fastmap_accurate = 0;
757 preg->syntax = syntax;
758 preg->not_bol = preg->not_eol = 0;
761 preg->can_be_null = 0;
762 preg->regs_allocated = REGS_UNALLOCATED;
764 /* Initialize the dfa. */
765 dfa = (re_dfa_t *) preg->buffer;
766 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
768 /* If zero allocated, but buffer is non-null, try to realloc
769 enough space. This loses if buffer's address is bogus, but
770 that is the user's responsibility. If ->buffer is NULL this
771 is a simple allocation. */
772 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
775 preg->allocated = sizeof (re_dfa_t);
776 preg->buffer = (unsigned char *) dfa;
778 preg->used = sizeof (re_dfa_t);
780 err = init_dfa (dfa, length);
781 if (BE (err != REG_NOERROR, 0))
783 free_dfa_content (dfa);
789 /* Note: length+1 will not overflow since it is checked in init_dfa. */
790 dfa->re_str = re_malloc (char, length + 1);
791 strncpy (dfa->re_str, pattern, length + 1);
794 __libc_lock_init (dfa->lock);
796 err = re_string_construct (®exp, pattern, length, preg->translate,
797 syntax & RE_ICASE, dfa);
798 if (BE (err != REG_NOERROR, 0))
800 re_compile_internal_free_return:
801 free_workarea_compile (preg);
802 re_string_destruct (®exp);
803 free_dfa_content (dfa);
809 /* Parse the regular expression, and build a structure tree. */
811 dfa->str_tree = parse (®exp, preg, syntax, &err);
812 if (BE (dfa->str_tree == NULL, 0))
813 goto re_compile_internal_free_return;
815 /* Analyze the tree and create the nfa. */
816 err = analyze (preg);
817 if (BE (err != REG_NOERROR, 0))
818 goto re_compile_internal_free_return;
820 #ifdef RE_ENABLE_I18N
821 /* If possible, do searching in single byte encoding to speed things up. */
822 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
826 /* Then create the initial state of the dfa. */
827 err = create_initial_state (dfa);
829 /* Release work areas. */
830 free_workarea_compile (preg);
831 re_string_destruct (®exp);
833 if (BE (err != REG_NOERROR, 0))
835 free_dfa_content (dfa);
843 /* Initialize DFA. We use the length of the regular expression PAT_LEN
844 as the initial length of some arrays. */
847 init_dfa (re_dfa_t *dfa, size_t pat_len)
849 unsigned int table_size;
854 memset (dfa, '\0', sizeof (re_dfa_t));
856 /* Force allocation of str_tree_storage the first time. */
857 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
859 /* Avoid overflows. */
860 if (pat_len == SIZE_MAX)
863 dfa->nodes_alloc = pat_len + 1;
864 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
866 /* table_size = 2 ^ ceil(log pat_len) */
867 for (table_size = 1; ; table_size <<= 1)
868 if (table_size > pat_len)
871 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
872 dfa->state_hash_mask = table_size - 1;
874 dfa->mb_cur_max = MB_CUR_MAX;
876 if (dfa->mb_cur_max == 6
877 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
879 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
882 # ifdef HAVE_LANGINFO_CODESET
883 codeset_name = nl_langinfo (CODESET);
885 codeset_name = getenv ("LC_ALL");
886 if (codeset_name == NULL || codeset_name[0] == '\0')
887 codeset_name = getenv ("LC_CTYPE");
888 if (codeset_name == NULL || codeset_name[0] == '\0')
889 codeset_name = getenv ("LANG");
890 if (codeset_name == NULL)
892 else if (strchr (codeset_name, '.') != NULL)
893 codeset_name = strchr (codeset_name, '.') + 1;
896 /* strcasecmp isn't a standard interface. brute force check */
898 if (strcasecmp (codeset_name, "UTF-8") == 0
899 || strcasecmp (codeset_name, "UTF8") == 0)
902 if ( (codeset_name[0] == 'U' || codeset_name[0] == 'u')
903 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
904 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
905 && (codeset_name[3] == '-'
906 ? codeset_name[4] == '8' && codeset_name[5] == '\0'
907 : codeset_name[3] == '8' && codeset_name[4] == '\0'))
911 /* We check exhaustively in the loop below if this charset is a
912 superset of ASCII. */
913 dfa->map_notascii = 0;
916 #ifdef RE_ENABLE_I18N
917 if (dfa->mb_cur_max > 1)
921 #if !defined(__GNUC__) || __GNUC__ < 3
922 static short utf8_sb_map_inited = 0;
924 if (! utf8_sb_map_inited)
928 utf8_sb_map_inited = 0;
929 for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
930 utf8_sb_map[i] = BITSET_WORD_MAX;
933 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
939 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
940 if (BE (dfa->sb_char == NULL, 0))
943 /* Set the bits corresponding to single byte chars. */
944 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
945 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
947 wint_t wch = __btowc (ch);
949 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
951 if (isascii (ch) && wch != ch)
952 dfa->map_notascii = 1;
959 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
964 /* Initialize WORD_CHAR table, which indicate which character is
965 "word". In this case "word" means that it is the word construction
966 character used by some operators like "\<", "\>", etc. */
970 init_word_char (re_dfa_t *dfa)
973 dfa->word_ops_used = 1;
974 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
975 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
976 if (isalnum (ch) || ch == '_')
977 dfa->word_char[i] |= (bitset_word_t) 1 << j;
980 /* Free the work area which are only used while compiling. */
983 free_workarea_compile (regex_t *preg)
985 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
986 bin_tree_storage_t *storage, *next;
987 for (storage = dfa->str_tree_storage; storage; storage = next)
989 next = storage->next;
992 dfa->str_tree_storage = NULL;
993 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
994 dfa->str_tree = NULL;
995 re_free (dfa->org_indices);
996 dfa->org_indices = NULL;
999 /* Create initial states for all contexts. */
1001 static reg_errcode_t
1002 create_initial_state (re_dfa_t *dfa)
1006 re_node_set init_nodes;
1008 /* Initial states have the epsilon closure of the node which is
1009 the first node of the regular expression. */
1010 first = dfa->str_tree->first->node_idx;
1011 dfa->init_node = first;
1012 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1013 if (BE (err != REG_NOERROR, 0))
1016 /* The back-references which are in initial states can epsilon transit,
1017 since in this case all of the subexpressions can be null.
1018 Then we add epsilon closures of the nodes which are the next nodes of
1019 the back-references. */
1020 if (dfa->nbackref > 0)
1021 for (i = 0; i < init_nodes.nelem; ++i)
1023 int node_idx = init_nodes.elems[i];
1024 re_token_type_t type = dfa->nodes[node_idx].type;
1027 if (type != OP_BACK_REF)
1029 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1031 re_token_t *clexp_node;
1032 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1033 if (clexp_node->type == OP_CLOSE_SUBEXP
1034 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1037 if (clexp_idx == init_nodes.nelem)
1040 if (type == OP_BACK_REF)
1042 int dest_idx = dfa->edests[node_idx].elems[0];
1043 if (!re_node_set_contains (&init_nodes, dest_idx))
1045 reg_errcode_t err = re_node_set_merge (&init_nodes,
1048 if (err != REG_NOERROR)
1055 /* It must be the first time to invoke acquire_state. */
1056 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1057 /* We don't check ERR here, since the initial state must not be NULL. */
1058 if (BE (dfa->init_state == NULL, 0))
1060 if (dfa->init_state->has_constraint)
1062 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1064 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1066 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1070 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1071 || dfa->init_state_begbuf == NULL, 0))
1075 dfa->init_state_word = dfa->init_state_nl
1076 = dfa->init_state_begbuf = dfa->init_state;
1078 re_node_set_free (&init_nodes);
1082 #ifdef RE_ENABLE_I18N
1083 /* If it is possible to do searching in single byte encoding instead of UTF-8
1084 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1085 DFA nodes where needed. */
1088 optimize_utf8 (re_dfa_t *dfa)
1090 int node, i, mb_chars = 0, has_period = 0;
1092 for (node = 0; node < dfa->nodes_len; ++node)
1093 switch (dfa->nodes[node].type)
1096 if (dfa->nodes[node].opr.c >= 0x80)
1100 switch (dfa->nodes[node].opr.ctx_type)
1108 /* Word anchors etc. cannot be handled. It's okay to test
1109 opr.ctx_type since constraints (for all DFA nodes) are
1110 created by ORing one or more opr.ctx_type values. */
1120 case OP_DUP_ASTERISK:
1121 case OP_OPEN_SUBEXP:
1122 case OP_CLOSE_SUBEXP:
1124 case COMPLEX_BRACKET:
1126 case SIMPLE_BRACKET:
1127 /* Just double check. The non-ASCII range starts at 0x80. */
1128 assert (0x80 % BITSET_WORD_BITS == 0);
1129 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1130 if (dfa->nodes[node].opr.sbcset[i])
1137 if (mb_chars || has_period)
1138 for (node = 0; node < dfa->nodes_len; ++node)
1140 if (dfa->nodes[node].type == CHARACTER
1141 && dfa->nodes[node].opr.c >= 0x80)
1142 dfa->nodes[node].mb_partial = 0;
1143 else if (dfa->nodes[node].type == OP_PERIOD)
1144 dfa->nodes[node].type = OP_UTF8_PERIOD;
1147 /* The search can be in single byte locale. */
1148 dfa->mb_cur_max = 1;
1150 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1154 /* Analyze the structure tree, and calculate "first", "next", "edest",
1155 "eclosure", and "inveclosure". */
1157 static reg_errcode_t
1158 analyze (regex_t *preg)
1160 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1163 /* Allocate arrays. */
1164 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1165 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1166 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1167 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1168 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1169 || dfa->eclosures == NULL, 0))
1172 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1173 if (dfa->subexp_map != NULL)
1176 for (i = 0; i < preg->re_nsub; i++)
1177 dfa->subexp_map[i] = i;
1178 preorder (dfa->str_tree, optimize_subexps, dfa);
1179 for (i = 0; i < preg->re_nsub; i++)
1180 if (dfa->subexp_map[i] != i)
1182 if (i == preg->re_nsub)
1184 free (dfa->subexp_map);
1185 dfa->subexp_map = NULL;
1189 ret = postorder (dfa->str_tree, lower_subexps, preg);
1190 if (BE (ret != REG_NOERROR, 0))
1192 ret = postorder (dfa->str_tree, calc_first, dfa);
1193 if (BE (ret != REG_NOERROR, 0))
1195 preorder (dfa->str_tree, calc_next, dfa);
1196 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1197 if (BE (ret != REG_NOERROR, 0))
1199 ret = calc_eclosure (dfa);
1200 if (BE (ret != REG_NOERROR, 0))
1203 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1204 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1205 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1208 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1209 if (BE (dfa->inveclosures == NULL, 0))
1211 ret = calc_inveclosure (dfa);
1217 /* Our parse trees are very unbalanced, so we cannot use a stack to
1218 implement parse tree visits. Instead, we use parent pointers and
1219 some hairy code in these two functions. */
1220 static reg_errcode_t
1221 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1224 bin_tree_t *node, *prev;
1226 for (node = root; ; )
1228 /* Descend down the tree, preferably to the left (or to the right
1229 if that's the only child). */
1230 while (node->left || node->right)
1238 reg_errcode_t err = fn (extra, node);
1239 if (BE (err != REG_NOERROR, 0))
1241 if (node->parent == NULL)
1244 node = node->parent;
1246 /* Go up while we have a node that is reached from the right. */
1247 while (node->right == prev || node->right == NULL);
1252 static reg_errcode_t
1253 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1258 for (node = root; ; )
1260 reg_errcode_t err = fn (extra, node);
1261 if (BE (err != REG_NOERROR, 0))
1264 /* Go to the left node, or up and to the right. */
1269 bin_tree_t *prev = NULL;
1270 while (node->right == prev || node->right == NULL)
1273 node = node->parent;
1282 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1283 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1284 backreferences as well. Requires a preorder visit. */
1285 static reg_errcode_t
1286 optimize_subexps (void *extra, bin_tree_t *node)
1288 re_dfa_t *dfa = (re_dfa_t *) extra;
1290 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1292 int idx = node->token.opr.idx;
1293 node->token.opr.idx = dfa->subexp_map[idx];
1294 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1297 else if (node->token.type == SUBEXP
1298 && node->left && node->left->token.type == SUBEXP)
1300 int other_idx = node->left->token.opr.idx;
1302 node->left = node->left->left;
1304 node->left->parent = node;
1306 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1307 if (other_idx < BITSET_WORD_BITS)
1308 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1314 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1315 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1316 static reg_errcode_t
1317 lower_subexps (void *extra, bin_tree_t *node)
1319 regex_t *preg = (regex_t *) extra;
1320 reg_errcode_t err = REG_NOERROR;
1322 if (node->left && node->left->token.type == SUBEXP)
1324 node->left = lower_subexp (&err, preg, node->left);
1326 node->left->parent = node;
1328 if (node->right && node->right->token.type == SUBEXP)
1330 node->right = lower_subexp (&err, preg, node->right);
1332 node->right->parent = node;
1339 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1341 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1342 bin_tree_t *body = node->left;
1343 bin_tree_t *op, *cls, *tree1, *tree;
1346 /* We do not optimize empty subexpressions, because otherwise we may
1347 have bad CONCAT nodes with NULL children. This is obviously not
1348 very common, so we do not lose much. An example that triggers
1349 this case is the sed "script" /\(\)/x. */
1350 && node->left != NULL
1351 && (node->token.opr.idx >= BITSET_WORD_BITS
1352 || !(dfa->used_bkref_map
1353 & ((bitset_word_t) 1 << node->token.opr.idx))))
1356 /* Convert the SUBEXP node to the concatenation of an
1357 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1358 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1359 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1360 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1361 tree = create_tree (dfa, op, tree1, CONCAT);
1362 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1368 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1369 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1373 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1374 nodes. Requires a postorder visit. */
1375 static reg_errcode_t
1376 calc_first (void *extra, bin_tree_t *node)
1378 re_dfa_t *dfa = (re_dfa_t *) extra;
1379 if (node->token.type == CONCAT)
1381 node->first = node->left->first;
1382 node->node_idx = node->left->node_idx;
1387 node->node_idx = re_dfa_add_node (dfa, node->token);
1388 if (BE (node->node_idx == -1, 0))
1390 if (node->token.type == ANCHOR)
1391 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1396 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1397 static reg_errcode_t
1398 calc_next (void *extra, bin_tree_t *node)
1400 switch (node->token.type)
1402 case OP_DUP_ASTERISK:
1403 node->left->next = node;
1406 node->left->next = node->right->first;
1407 node->right->next = node->next;
1411 node->left->next = node->next;
1413 node->right->next = node->next;
1419 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1420 static reg_errcode_t
1421 link_nfa_nodes (void *extra, bin_tree_t *node)
1423 re_dfa_t *dfa = (re_dfa_t *) extra;
1424 int idx = node->node_idx;
1425 reg_errcode_t err = REG_NOERROR;
1427 switch (node->token.type)
1433 assert (node->next == NULL);
1436 case OP_DUP_ASTERISK:
1440 dfa->has_plural_match = 1;
1441 if (node->left != NULL)
1442 left = node->left->first->node_idx;
1444 left = node->next->node_idx;
1445 if (node->right != NULL)
1446 right = node->right->first->node_idx;
1448 right = node->next->node_idx;
1450 assert (right > -1);
1451 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1456 case OP_OPEN_SUBEXP:
1457 case OP_CLOSE_SUBEXP:
1458 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1462 dfa->nexts[idx] = node->next->node_idx;
1463 if (node->token.type == OP_BACK_REF)
1464 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1468 assert (!IS_EPSILON_NODE (node->token.type));
1469 dfa->nexts[idx] = node->next->node_idx;
1476 /* Duplicate the epsilon closure of the node ROOT_NODE.
1477 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1478 to their own constraint. */
1480 static reg_errcode_t
1482 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1483 int root_node, unsigned int init_constraint)
1485 int org_node, clone_node, ret;
1486 unsigned int constraint = init_constraint;
1487 for (org_node = top_org_node, clone_node = top_clone_node;;)
1489 int org_dest, clone_dest;
1490 if (dfa->nodes[org_node].type == OP_BACK_REF)
1492 /* If the back reference epsilon-transit, its destination must
1493 also have the constraint. Then duplicate the epsilon closure
1494 of the destination of the back reference, and store it in
1495 edests of the back reference. */
1496 org_dest = dfa->nexts[org_node];
1497 re_node_set_empty (dfa->edests + clone_node);
1498 clone_dest = duplicate_node (dfa, org_dest, constraint);
1499 if (BE (clone_dest == -1, 0))
1501 dfa->nexts[clone_node] = dfa->nexts[org_node];
1502 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1503 if (BE (ret < 0, 0))
1506 else if (dfa->edests[org_node].nelem == 0)
1508 /* In case of the node can't epsilon-transit, don't duplicate the
1509 destination and store the original destination as the
1510 destination of the node. */
1511 dfa->nexts[clone_node] = dfa->nexts[org_node];
1514 else if (dfa->edests[org_node].nelem == 1)
1516 /* In case of the node can epsilon-transit, and it has only one
1518 org_dest = dfa->edests[org_node].elems[0];
1519 re_node_set_empty (dfa->edests + clone_node);
1520 /* If the node is root_node itself, it means the epsilon clsoure
1521 has a loop. Then tie it to the destination of the root_node. */
1522 if (org_node == root_node && clone_node != org_node)
1524 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1525 if (BE (ret < 0, 0))
1529 /* In case of the node has another constraint, add it. */
1530 constraint |= dfa->nodes[org_node].constraint;
1531 clone_dest = duplicate_node (dfa, org_dest, constraint);
1532 if (BE (clone_dest == -1, 0))
1534 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1535 if (BE (ret < 0, 0))
1538 else /* dfa->edests[org_node].nelem == 2 */
1540 /* In case of the node can epsilon-transit, and it has two
1541 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1542 org_dest = dfa->edests[org_node].elems[0];
1543 re_node_set_empty (dfa->edests + clone_node);
1544 /* Search for a duplicated node which satisfies the constraint. */
1545 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1546 if (clone_dest == -1)
1548 /* There is no such duplicated node, create a new one. */
1550 clone_dest = duplicate_node (dfa, org_dest, constraint);
1551 if (BE (clone_dest == -1, 0))
1553 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1554 if (BE (ret < 0, 0))
1556 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1557 root_node, constraint);
1558 if (BE (err != REG_NOERROR, 0))
1563 /* There is a duplicated node which satisfies the constraint,
1564 use it to avoid infinite loop. */
1565 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1566 if (BE (ret < 0, 0))
1570 org_dest = dfa->edests[org_node].elems[1];
1571 clone_dest = duplicate_node (dfa, org_dest, constraint);
1572 if (BE (clone_dest == -1, 0))
1574 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1575 if (BE (ret < 0, 0))
1578 org_node = org_dest;
1579 clone_node = clone_dest;
1584 /* Search for a node which is duplicated from the node ORG_NODE, and
1585 satisfies the constraint CONSTRAINT. */
1588 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1589 unsigned int constraint)
1592 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1594 if (org_node == dfa->org_indices[idx]
1595 && constraint == dfa->nodes[idx].constraint)
1596 return idx; /* Found. */
1598 return -1; /* Not found. */
1601 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1602 Return the index of the new node, or -1 if insufficient storage is
1606 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1608 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1609 if (BE (dup_idx != -1, 1))
1611 dfa->nodes[dup_idx].constraint = constraint;
1612 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1613 dfa->nodes[dup_idx].duplicated = 1;
1615 /* Store the index of the original node. */
1616 dfa->org_indices[dup_idx] = org_idx;
1621 static reg_errcode_t
1622 calc_inveclosure (re_dfa_t *dfa)
1625 for (idx = 0; idx < dfa->nodes_len; ++idx)
1626 re_node_set_init_empty (dfa->inveclosures + idx);
1628 for (src = 0; src < dfa->nodes_len; ++src)
1630 int *elems = dfa->eclosures[src].elems;
1631 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1633 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1634 if (BE (ret == -1, 0))
1642 /* Calculate "eclosure" for all the node in DFA. */
1644 static reg_errcode_t
1645 calc_eclosure (re_dfa_t *dfa)
1647 int node_idx, incomplete;
1649 assert (dfa->nodes_len > 0);
1652 /* For each nodes, calculate epsilon closure. */
1653 for (node_idx = 0; ; ++node_idx)
1656 re_node_set eclosure_elem;
1657 if (node_idx == dfa->nodes_len)
1666 assert (dfa->eclosures[node_idx].nelem != -1);
1669 /* If we have already calculated, skip it. */
1670 if (dfa->eclosures[node_idx].nelem != 0)
1672 /* Calculate epsilon closure of `node_idx'. */
1673 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1674 if (BE (err != REG_NOERROR, 0))
1677 if (dfa->eclosures[node_idx].nelem == 0)
1680 re_node_set_free (&eclosure_elem);
1686 /* Calculate epsilon closure of NODE. */
1688 static reg_errcode_t
1689 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1693 re_node_set eclosure;
1696 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1697 if (BE (err != REG_NOERROR, 0))
1700 /* This indicates that we are calculating this node now.
1701 We reference this value to avoid infinite loop. */
1702 dfa->eclosures[node].nelem = -1;
1704 /* If the current node has constraints, duplicate all nodes
1705 since they must inherit the constraints. */
1706 if (dfa->nodes[node].constraint
1707 && dfa->edests[node].nelem
1708 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1710 err = duplicate_node_closure (dfa, node, node, node,
1711 dfa->nodes[node].constraint);
1712 if (BE (err != REG_NOERROR, 0))
1716 /* Expand each epsilon destination nodes. */
1717 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1718 for (i = 0; i < dfa->edests[node].nelem; ++i)
1720 re_node_set eclosure_elem;
1721 int edest = dfa->edests[node].elems[i];
1722 /* If calculating the epsilon closure of `edest' is in progress,
1723 return intermediate result. */
1724 if (dfa->eclosures[edest].nelem == -1)
1729 /* If we haven't calculated the epsilon closure of `edest' yet,
1730 calculate now. Otherwise use calculated epsilon closure. */
1731 if (dfa->eclosures[edest].nelem == 0)
1733 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1734 if (BE (err != REG_NOERROR, 0))
1738 eclosure_elem = dfa->eclosures[edest];
1739 /* Merge the epsilon closure of `edest'. */
1740 err = re_node_set_merge (&eclosure, &eclosure_elem);
1741 if (BE (err != REG_NOERROR, 0))
1743 /* If the epsilon closure of `edest' is incomplete,
1744 the epsilon closure of this node is also incomplete. */
1745 if (dfa->eclosures[edest].nelem == 0)
1748 re_node_set_free (&eclosure_elem);
1752 /* An epsilon closure includes itself. */
1753 ret = re_node_set_insert (&eclosure, node);
1754 if (BE (ret < 0, 0))
1756 if (incomplete && !root)
1757 dfa->eclosures[node].nelem = 0;
1759 dfa->eclosures[node] = eclosure;
1760 *new_set = eclosure;
1764 /* Functions for token which are used in the parser. */
1766 /* Fetch a token from INPUT.
1767 We must not use this function inside bracket expressions. */
1771 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1773 re_string_skip_bytes (input, peek_token (result, input, syntax));
1776 /* Peek a token from INPUT, and return the length of the token.
1777 We must not use this function inside bracket expressions. */
1781 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1785 if (re_string_eoi (input))
1787 token->type = END_OF_RE;
1791 c = re_string_peek_byte (input, 0);
1794 token->word_char = 0;
1795 #ifdef RE_ENABLE_I18N
1796 token->mb_partial = 0;
1797 if (input->mb_cur_max > 1 &&
1798 !re_string_first_byte (input, re_string_cur_idx (input)))
1800 token->type = CHARACTER;
1801 token->mb_partial = 1;
1808 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1810 token->type = BACK_SLASH;
1814 c2 = re_string_peek_byte_case (input, 1);
1816 token->type = CHARACTER;
1817 #ifdef RE_ENABLE_I18N
1818 if (input->mb_cur_max > 1)
1820 wint_t wc = re_string_wchar_at (input,
1821 re_string_cur_idx (input) + 1);
1822 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1826 token->word_char = IS_WORD_CHAR (c2) != 0;
1831 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1832 token->type = OP_ALT;
1834 case '1': case '2': case '3': case '4': case '5':
1835 case '6': case '7': case '8': case '9':
1836 if (!(syntax & RE_NO_BK_REFS))
1838 token->type = OP_BACK_REF;
1839 token->opr.idx = c2 - '1';
1843 if (!(syntax & RE_NO_GNU_OPS))
1845 token->type = ANCHOR;
1846 token->opr.ctx_type = WORD_FIRST;
1850 if (!(syntax & RE_NO_GNU_OPS))
1852 token->type = ANCHOR;
1853 token->opr.ctx_type = WORD_LAST;
1857 if (!(syntax & RE_NO_GNU_OPS))
1859 token->type = ANCHOR;
1860 token->opr.ctx_type = WORD_DELIM;
1864 if (!(syntax & RE_NO_GNU_OPS))
1866 token->type = ANCHOR;
1867 token->opr.ctx_type = NOT_WORD_DELIM;
1871 if (!(syntax & RE_NO_GNU_OPS))
1872 token->type = OP_WORD;
1875 if (!(syntax & RE_NO_GNU_OPS))
1876 token->type = OP_NOTWORD;
1879 if (!(syntax & RE_NO_GNU_OPS))
1880 token->type = OP_SPACE;
1883 if (!(syntax & RE_NO_GNU_OPS))
1884 token->type = OP_NOTSPACE;
1887 if (!(syntax & RE_NO_GNU_OPS))
1889 token->type = ANCHOR;
1890 token->opr.ctx_type = BUF_FIRST;
1894 if (!(syntax & RE_NO_GNU_OPS))
1896 token->type = ANCHOR;
1897 token->opr.ctx_type = BUF_LAST;
1901 if (!(syntax & RE_NO_BK_PARENS))
1902 token->type = OP_OPEN_SUBEXP;
1905 if (!(syntax & RE_NO_BK_PARENS))
1906 token->type = OP_CLOSE_SUBEXP;
1909 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1910 token->type = OP_DUP_PLUS;
1913 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1914 token->type = OP_DUP_QUESTION;
1917 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1918 token->type = OP_OPEN_DUP_NUM;
1921 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1922 token->type = OP_CLOSE_DUP_NUM;
1930 token->type = CHARACTER;
1931 #ifdef RE_ENABLE_I18N
1932 if (input->mb_cur_max > 1)
1934 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1935 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1939 token->word_char = IS_WORD_CHAR (token->opr.c);
1944 if (syntax & RE_NEWLINE_ALT)
1945 token->type = OP_ALT;
1948 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1949 token->type = OP_ALT;
1952 token->type = OP_DUP_ASTERISK;
1955 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1956 token->type = OP_DUP_PLUS;
1959 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1960 token->type = OP_DUP_QUESTION;
1963 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1964 token->type = OP_OPEN_DUP_NUM;
1967 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1968 token->type = OP_CLOSE_DUP_NUM;
1971 if (syntax & RE_NO_BK_PARENS)
1972 token->type = OP_OPEN_SUBEXP;
1975 if (syntax & RE_NO_BK_PARENS)
1976 token->type = OP_CLOSE_SUBEXP;
1979 token->type = OP_OPEN_BRACKET;
1982 token->type = OP_PERIOD;
1985 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1986 re_string_cur_idx (input) != 0)
1988 char prev = re_string_peek_byte (input, -1);
1989 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1992 token->type = ANCHOR;
1993 token->opr.ctx_type = LINE_FIRST;
1996 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1997 re_string_cur_idx (input) + 1 != re_string_length (input))
2000 re_string_skip_bytes (input, 1);
2001 peek_token (&next, input, syntax);
2002 re_string_skip_bytes (input, -1);
2003 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2006 token->type = ANCHOR;
2007 token->opr.ctx_type = LINE_LAST;
2015 /* Peek a token from INPUT, and return the length of the token.
2016 We must not use this function out of bracket expressions. */
2020 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2023 if (re_string_eoi (input))
2025 token->type = END_OF_RE;
2028 c = re_string_peek_byte (input, 0);
2031 #ifdef RE_ENABLE_I18N
2032 if (input->mb_cur_max > 1 &&
2033 !re_string_first_byte (input, re_string_cur_idx (input)))
2035 token->type = CHARACTER;
2038 #endif /* RE_ENABLE_I18N */
2040 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2041 && re_string_cur_idx (input) + 1 < re_string_length (input))
2043 /* In this case, '\' escape a character. */
2045 re_string_skip_bytes (input, 1);
2046 c2 = re_string_peek_byte (input, 0);
2048 token->type = CHARACTER;
2051 if (c == '[') /* '[' is a special char in a bracket exps. */
2055 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2056 c2 = re_string_peek_byte (input, 1);
2064 token->type = OP_OPEN_COLL_ELEM;
2067 token->type = OP_OPEN_EQUIV_CLASS;
2070 if (syntax & RE_CHAR_CLASSES)
2072 token->type = OP_OPEN_CHAR_CLASS;
2075 /* else fall through. */
2077 token->type = CHARACTER;
2087 token->type = OP_CHARSET_RANGE;
2090 token->type = OP_CLOSE_BRACKET;
2093 token->type = OP_NON_MATCH_LIST;
2096 token->type = CHARACTER;
2101 /* Functions for parser. */
2103 /* Entry point of the parser.
2104 Parse the regular expression REGEXP and return the structure tree.
2105 If an error has occurred, ERR is set by error code, and return NULL.
2106 This function build the following tree, from regular expression <reg_exp>:
2112 CAT means concatenation.
2113 EOR means end of regular expression. */
2116 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2119 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2120 bin_tree_t *tree, *eor, *root;
2121 re_token_t current_token;
2122 dfa->syntax = syntax;
2123 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2124 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2125 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2127 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2129 root = create_tree (dfa, tree, eor, CONCAT);
2132 if (BE (eor == NULL || root == NULL, 0))
2140 /* This function build the following tree, from regular expression
2141 <branch1>|<branch2>:
2147 ALT means alternative, which represents the operator `|'. */
2150 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2151 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2153 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2154 bin_tree_t *tree, *branch = NULL;
2155 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2156 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2159 while (token->type == OP_ALT)
2161 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2162 if (token->type != OP_ALT && token->type != END_OF_RE
2163 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2165 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2166 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2171 tree = create_tree (dfa, tree, branch, OP_ALT);
2172 if (BE (tree == NULL, 0))
2181 /* This function build the following tree, from regular expression
2188 CAT means concatenation. */
2191 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2192 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2194 bin_tree_t *tree, *exp;
2195 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2196 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2197 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2200 while (token->type != OP_ALT && token->type != END_OF_RE
2201 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2203 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2204 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2208 if (tree != NULL && exp != NULL)
2210 tree = create_tree (dfa, tree, exp, CONCAT);
2217 else if (tree == NULL)
2219 /* Otherwise exp == NULL, we don't need to create new tree. */
2224 /* This function build the following tree, from regular expression a*:
2231 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2232 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2234 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2236 switch (token->type)
2239 tree = create_token_tree (dfa, NULL, NULL, token);
2240 if (BE (tree == NULL, 0))
2245 #ifdef RE_ENABLE_I18N
2246 if (dfa->mb_cur_max > 1)
2248 while (!re_string_eoi (regexp)
2249 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2251 bin_tree_t *mbc_remain;
2252 fetch_token (token, regexp, syntax);
2253 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2254 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2255 if (BE (mbc_remain == NULL || tree == NULL, 0))
2264 case OP_OPEN_SUBEXP:
2265 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2266 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2269 case OP_OPEN_BRACKET:
2270 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2271 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2275 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2280 dfa->used_bkref_map |= 1 << token->opr.idx;
2281 tree = create_token_tree (dfa, NULL, NULL, token);
2282 if (BE (tree == NULL, 0))
2288 dfa->has_mb_node = 1;
2290 case OP_OPEN_DUP_NUM:
2291 if (syntax & RE_CONTEXT_INVALID_DUP)
2297 case OP_DUP_ASTERISK:
2299 case OP_DUP_QUESTION:
2300 if (syntax & RE_CONTEXT_INVALID_OPS)
2305 else if (syntax & RE_CONTEXT_INDEP_OPS)
2307 fetch_token (token, regexp, syntax);
2308 return parse_expression (regexp, preg, token, syntax, nest, err);
2310 /* else fall through */
2311 case OP_CLOSE_SUBEXP:
2312 if ((token->type == OP_CLOSE_SUBEXP) &&
2313 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2318 /* else fall through */
2319 case OP_CLOSE_DUP_NUM:
2320 /* We treat it as a normal character. */
2322 /* Then we can these characters as normal characters. */
2323 token->type = CHARACTER;
2324 /* mb_partial and word_char bits should be initialized already
2326 tree = create_token_tree (dfa, NULL, NULL, token);
2327 if (BE (tree == NULL, 0))
2334 if ((token->opr.ctx_type
2335 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2336 && dfa->word_ops_used == 0)
2337 init_word_char (dfa);
2338 if (token->opr.ctx_type == WORD_DELIM
2339 || token->opr.ctx_type == NOT_WORD_DELIM)
2341 bin_tree_t *tree_first, *tree_last;
2342 if (token->opr.ctx_type == WORD_DELIM)
2344 token->opr.ctx_type = WORD_FIRST;
2345 tree_first = create_token_tree (dfa, NULL, NULL, token);
2346 token->opr.ctx_type = WORD_LAST;
2350 token->opr.ctx_type = INSIDE_WORD;
2351 tree_first = create_token_tree (dfa, NULL, NULL, token);
2352 token->opr.ctx_type = INSIDE_NOTWORD;
2354 tree_last = create_token_tree (dfa, NULL, NULL, token);
2355 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2356 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2364 tree = create_token_tree (dfa, NULL, NULL, token);
2365 if (BE (tree == NULL, 0))
2371 /* We must return here, since ANCHORs can't be followed
2372 by repetition operators.
2373 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2374 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2375 fetch_token (token, regexp, syntax);
2378 tree = create_token_tree (dfa, NULL, NULL, token);
2379 if (BE (tree == NULL, 0))
2384 if (dfa->mb_cur_max > 1)
2385 dfa->has_mb_node = 1;
2389 tree = build_charclass_op (dfa, regexp->trans,
2392 token->type == OP_NOTWORD, err);
2393 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2398 tree = build_charclass_op (dfa, regexp->trans,
2401 token->type == OP_NOTSPACE, err);
2402 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2412 /* Must not happen? */
2418 fetch_token (token, regexp, syntax);
2420 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2421 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2423 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2424 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2426 /* In BRE consecutive duplications are not allowed. */
2427 if ((syntax & RE_CONTEXT_INVALID_DUP)
2428 && (token->type == OP_DUP_ASTERISK
2429 || token->type == OP_OPEN_DUP_NUM))
2439 /* This function build the following tree, from regular expression
2447 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2448 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2450 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2453 cur_nsub = preg->re_nsub++;
2455 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2457 /* The subexpression may be a null string. */
2458 if (token->type == OP_CLOSE_SUBEXP)
2462 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2463 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2465 if (BE (*err != REG_NOERROR, 0))
2469 if (cur_nsub <= '9' - '1')
2470 dfa->completed_bkref_map |= 1 << cur_nsub;
2472 tree = create_tree (dfa, tree, NULL, SUBEXP);
2473 if (BE (tree == NULL, 0))
2478 tree->token.opr.idx = cur_nsub;
2482 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2485 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2486 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2488 bin_tree_t *tree = NULL, *old_tree = NULL;
2489 int i, start, end, start_idx = re_string_cur_idx (regexp);
2490 #ifndef RE_TOKEN_INIT_BUG
2491 re_token_t start_token = *token;
2493 re_token_t start_token;
2495 memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2498 if (token->type == OP_OPEN_DUP_NUM)
2501 start = fetch_number (regexp, token, syntax);
2504 if (token->type == CHARACTER && token->opr.c == ',')
2505 start = 0; /* We treat "{,m}" as "{0,m}". */
2508 *err = REG_BADBR; /* <re>{} is invalid. */
2512 if (BE (start != -2, 1))
2514 /* We treat "{n}" as "{n,n}". */
2515 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2516 : ((token->type == CHARACTER && token->opr.c == ',')
2517 ? fetch_number (regexp, token, syntax) : -2));
2519 if (BE (start == -2 || end == -2, 0))
2521 /* Invalid sequence. */
2522 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2524 if (token->type == END_OF_RE)
2532 /* If the syntax bit is set, rollback. */
2533 re_string_set_index (regexp, start_idx);
2534 *token = start_token;
2535 token->type = CHARACTER;
2536 /* mb_partial and word_char bits should be already initialized by
2541 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2543 /* First number greater than second. */
2550 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2551 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2554 fetch_token (token, regexp, syntax);
2556 if (BE (elem == NULL, 0))
2558 if (BE (start == 0 && end == 0, 0))
2560 postorder (elem, free_tree, NULL);
2564 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2565 if (BE (start > 0, 0))
2568 for (i = 2; i <= start; ++i)
2570 elem = duplicate_tree (elem, dfa);
2571 tree = create_tree (dfa, tree, elem, CONCAT);
2572 if (BE (elem == NULL || tree == NULL, 0))
2573 goto parse_dup_op_espace;
2579 /* Duplicate ELEM before it is marked optional. */
2580 elem = duplicate_tree (elem, dfa);
2586 if (elem->token.type == SUBEXP)
2587 postorder (elem, mark_opt_subexp, (void *) (intptr_t) elem->token.opr.idx);
2589 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2590 if (BE (tree == NULL, 0))
2591 goto parse_dup_op_espace;
2593 /* This loop is actually executed only when end != -1,
2594 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2595 already created the start+1-th copy. */
2596 for (i = start + 2; i <= end; ++i)
2598 elem = duplicate_tree (elem, dfa);
2599 tree = create_tree (dfa, tree, elem, CONCAT);
2600 if (BE (elem == NULL || tree == NULL, 0))
2601 goto parse_dup_op_espace;
2603 tree = create_tree (dfa, tree, NULL, OP_ALT);
2604 if (BE (tree == NULL, 0))
2605 goto parse_dup_op_espace;
2609 tree = create_tree (dfa, old_tree, tree, CONCAT);
2613 parse_dup_op_espace:
2618 /* Size of the names for collating symbol/equivalence_class/character_class.
2619 I'm not sure, but maybe enough. */
2620 #define BRACKET_NAME_BUF_SIZE 32
2623 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2624 Build the range expression which starts from START_ELEM, and ends
2625 at END_ELEM. The result are written to MBCSET and SBCSET.
2626 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2627 mbcset->range_ends, is a pointer argument since we may
2630 static reg_errcode_t
2632 # ifdef RE_ENABLE_I18N
2633 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2634 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2635 # else /* not RE_ENABLE_I18N */
2636 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2637 bracket_elem_t *end_elem)
2638 # endif /* not RE_ENABLE_I18N */
2640 unsigned int start_ch, end_ch;
2641 /* Equivalence Classes and Character Classes can't be a range start/end. */
2642 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2643 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2647 /* We can handle no multi character collating elements without libc
2649 if (BE ((start_elem->type == COLL_SYM
2650 && strlen ((char *) start_elem->opr.name) > 1)
2651 || (end_elem->type == COLL_SYM
2652 && strlen ((char *) end_elem->opr.name) > 1), 0))
2653 return REG_ECOLLATE;
2655 # ifdef RE_ENABLE_I18N
2660 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2662 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2663 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2665 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2666 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2670 * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2671 * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2672 * unsigned, so we don't have sign extension problems.
2674 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2675 ? start_ch : start_elem->opr.wch);
2676 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2677 ? end_ch : end_elem->opr.wch);
2679 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2680 ? __btowc (start_ch) : start_elem->opr.wch);
2681 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2682 ? __btowc (end_ch) : end_elem->opr.wch);
2684 if (start_wc == WEOF || end_wc == WEOF)
2685 return REG_ECOLLATE;
2686 cmp_buf[0] = start_wc;
2687 cmp_buf[4] = end_wc;
2688 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2691 /* Got valid collation sequence values, add them as a new entry.
2692 However, for !_LIBC we have no collation elements: if the
2693 character set is single byte, the single byte character set
2694 that we build below suffices. parse_bracket_exp passes
2695 no MBCSET if dfa->mb_cur_max == 1. */
2698 /* Check the space of the arrays. */
2699 if (BE (*range_alloc == mbcset->nranges, 0))
2701 /* There is not enough space, need realloc. */
2702 wchar_t *new_array_start, *new_array_end;
2705 /* +1 in case of mbcset->nranges is 0. */
2706 new_nranges = 2 * mbcset->nranges + 1;
2707 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2708 are NULL if *range_alloc == 0. */
2709 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2711 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2714 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2717 mbcset->range_starts = new_array_start;
2718 mbcset->range_ends = new_array_end;
2719 *range_alloc = new_nranges;
2722 mbcset->range_starts[mbcset->nranges] = start_wc;
2723 mbcset->range_ends[mbcset->nranges++] = end_wc;
2726 /* Build the table for single byte characters. */
2727 for (wc = 0; wc < SBC_MAX; ++wc)
2730 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2731 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2732 bitset_set (sbcset, wc);
2735 # else /* not RE_ENABLE_I18N */
2738 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2739 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2741 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2742 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2744 if (start_ch > end_ch)
2746 /* Build the table for single byte characters. */
2747 for (ch = 0; ch < SBC_MAX; ++ch)
2748 if (start_ch <= ch && ch <= end_ch)
2749 bitset_set (sbcset, ch);
2751 # endif /* not RE_ENABLE_I18N */
2754 #endif /* not _LIBC */
2757 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2758 Build the collating element which is represented by NAME.
2759 The result are written to MBCSET and SBCSET.
2760 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2761 pointer argument since we may update it. */
2763 static reg_errcode_t
2765 # ifdef RE_ENABLE_I18N
2766 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2767 int *coll_sym_alloc, const unsigned char *name)
2768 # else /* not RE_ENABLE_I18N */
2769 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2770 # endif /* not RE_ENABLE_I18N */
2772 size_t name_len = strlen ((const char *) name);
2773 if (BE (name_len != 1, 0))
2774 return REG_ECOLLATE;
2777 bitset_set (sbcset, name[0]);
2781 #endif /* not _LIBC */
2783 /* This function parse bracket expression like "[abc]", "[a-c]",
2787 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2788 reg_syntax_t syntax, reg_errcode_t *err)
2791 const unsigned char *collseqmb;
2792 const char *collseqwc;
2795 const int32_t *symb_table;
2796 const unsigned char *extra;
2798 /* Local function for parse_bracket_exp used in _LIBC environment.
2799 Seek the collating symbol entry correspondings to NAME.
2800 Return the index of the symbol in the SYMB_TABLE. */
2803 __attribute ((always_inline))
2804 seek_collating_symbol_entry (name, name_len)
2805 const unsigned char *name;
2808 int32_t hash = elem_hash ((const char *) name, name_len);
2809 int32_t elem = hash % table_size;
2810 if (symb_table[2 * elem] != 0)
2812 int32_t second = hash % (table_size - 2) + 1;
2816 /* First compare the hashing value. */
2817 if (symb_table[2 * elem] == hash
2818 /* Compare the length of the name. */
2819 && name_len == extra[symb_table[2 * elem + 1]]
2820 /* Compare the name. */
2821 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2824 /* Yep, this is the entry. */
2831 while (symb_table[2 * elem] != 0);
2836 /* Local function for parse_bracket_exp used in _LIBC environment.
2837 Look up the collation sequence value of BR_ELEM.
2838 Return the value if succeeded, UINT_MAX otherwise. */
2840 auto inline unsigned int
2841 __attribute ((always_inline))
2842 lookup_collation_sequence_value (br_elem)
2843 bracket_elem_t *br_elem;
2845 if (br_elem->type == SB_CHAR)
2848 if (MB_CUR_MAX == 1)
2851 return collseqmb[br_elem->opr.ch];
2854 wint_t wc = __btowc (br_elem->opr.ch);
2855 return __collseq_table_lookup (collseqwc, wc);
2858 else if (br_elem->type == MB_CHAR)
2861 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2863 else if (br_elem->type == COLL_SYM)
2865 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2869 elem = seek_collating_symbol_entry (br_elem->opr.name,
2871 if (symb_table[2 * elem] != 0)
2873 /* We found the entry. */
2874 idx = symb_table[2 * elem + 1];
2875 /* Skip the name of collating element name. */
2876 idx += 1 + extra[idx];
2877 /* Skip the byte sequence of the collating element. */
2878 idx += 1 + extra[idx];
2879 /* Adjust for the alignment. */
2880 idx = (idx + 3) & ~3;
2881 /* Skip the multibyte collation sequence value. */
2882 idx += sizeof (unsigned int);
2883 /* Skip the wide char sequence of the collating element. */
2884 idx += sizeof (unsigned int) *
2885 (1 + *(unsigned int *) (extra + idx));
2886 /* Return the collation sequence value. */
2887 return *(unsigned int *) (extra + idx);
2889 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2891 /* No valid character. Match it as a single byte
2893 return collseqmb[br_elem->opr.name[0]];
2896 else if (sym_name_len == 1)
2897 return collseqmb[br_elem->opr.name[0]];
2902 /* Local function for parse_bracket_exp used in _LIBC environment.
2903 Build the range expression which starts from START_ELEM, and ends
2904 at END_ELEM. The result are written to MBCSET and SBCSET.
2905 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2906 mbcset->range_ends, is a pointer argument since we may
2909 auto inline reg_errcode_t
2910 __attribute ((always_inline))
2911 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2912 re_charset_t *mbcset;
2915 bracket_elem_t *start_elem, *end_elem;
2918 uint32_t start_collseq;
2919 uint32_t end_collseq;
2921 /* Equivalence Classes and Character Classes can't be a range
2923 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2924 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2928 start_collseq = lookup_collation_sequence_value (start_elem);
2929 end_collseq = lookup_collation_sequence_value (end_elem);
2930 /* Check start/end collation sequence values. */
2931 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2932 return REG_ECOLLATE;
2933 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2936 /* Got valid collation sequence values, add them as a new entry.
2937 However, if we have no collation elements, and the character set
2938 is single byte, the single byte character set that we
2939 build below suffices. */
2940 if (nrules > 0 || dfa->mb_cur_max > 1)
2942 /* Check the space of the arrays. */
2943 if (BE (*range_alloc == mbcset->nranges, 0))
2945 /* There is not enough space, need realloc. */
2946 uint32_t *new_array_start;
2947 uint32_t *new_array_end;
2950 /* +1 in case of mbcset->nranges is 0. */
2951 new_nranges = 2 * mbcset->nranges + 1;
2952 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2954 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2957 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2960 mbcset->range_starts = new_array_start;
2961 mbcset->range_ends = new_array_end;
2962 *range_alloc = new_nranges;
2965 mbcset->range_starts[mbcset->nranges] = start_collseq;
2966 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2969 /* Build the table for single byte characters. */
2970 for (ch = 0; ch < SBC_MAX; ch++)
2972 uint32_t ch_collseq;
2974 if (MB_CUR_MAX == 1)
2977 ch_collseq = collseqmb[ch];
2979 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2980 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2981 bitset_set (sbcset, ch);
2986 /* Local function for parse_bracket_exp used in _LIBC environment.
2987 Build the collating element which is represented by NAME.
2988 The result are written to MBCSET and SBCSET.
2989 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2990 pointer argument since we may update it. */
2992 auto inline reg_errcode_t
2993 __attribute ((always_inline))
2994 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2995 re_charset_t *mbcset;
2996 int *coll_sym_alloc;
2998 const unsigned char *name;
3001 size_t name_len = strlen ((const char *) name);
3004 elem = seek_collating_symbol_entry (name, name_len);
3005 if (symb_table[2 * elem] != 0)
3007 /* We found the entry. */
3008 idx = symb_table[2 * elem + 1];
3009 /* Skip the name of collating element name. */
3010 idx += 1 + extra[idx];
3012 else if (symb_table[2 * elem] == 0 && name_len == 1)
3014 /* No valid character, treat it as a normal
3016 bitset_set (sbcset, name[0]);
3020 return REG_ECOLLATE;
3022 /* Got valid collation sequence, add it as a new entry. */
3023 /* Check the space of the arrays. */
3024 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3026 /* Not enough, realloc it. */
3027 /* +1 in case of mbcset->ncoll_syms is 0. */
3028 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3029 /* Use realloc since mbcset->coll_syms is NULL
3031 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3032 new_coll_sym_alloc);
3033 if (BE (new_coll_syms == NULL, 0))
3035 mbcset->coll_syms = new_coll_syms;
3036 *coll_sym_alloc = new_coll_sym_alloc;
3038 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3043 if (BE (name_len != 1, 0))
3044 return REG_ECOLLATE;
3047 bitset_set (sbcset, name[0]);
3054 re_token_t br_token;
3055 re_bitset_ptr_t sbcset;
3056 #ifdef RE_ENABLE_I18N
3057 re_charset_t *mbcset;
3058 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3059 int equiv_class_alloc = 0, char_class_alloc = 0;
3060 #endif /* not RE_ENABLE_I18N */
3062 bin_tree_t *work_tree;
3064 int first_round = 1;
3066 collseqmb = (const unsigned char *)
3067 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3068 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3074 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3075 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3076 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3077 _NL_COLLATE_SYMB_TABLEMB);
3078 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3079 _NL_COLLATE_SYMB_EXTRAMB);
3082 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3083 #ifdef RE_ENABLE_I18N
3084 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3085 #endif /* RE_ENABLE_I18N */
3086 #ifdef RE_ENABLE_I18N
3087 if (BE (sbcset == NULL || mbcset == NULL, 0))
3089 if (BE (sbcset == NULL, 0))
3090 #endif /* RE_ENABLE_I18N */
3096 token_len = peek_token_bracket (token, regexp, syntax);
3097 if (BE (token->type == END_OF_RE, 0))
3100 goto parse_bracket_exp_free_return;
3102 if (token->type == OP_NON_MATCH_LIST)
3104 #ifdef RE_ENABLE_I18N
3105 mbcset->non_match = 1;
3106 #endif /* not RE_ENABLE_I18N */
3108 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3109 bitset_set (sbcset, '\n');
3110 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3111 token_len = peek_token_bracket (token, regexp, syntax);
3112 if (BE (token->type == END_OF_RE, 0))
3115 goto parse_bracket_exp_free_return;
3119 /* We treat the first ']' as a normal character. */
3120 if (token->type == OP_CLOSE_BRACKET)
3121 token->type = CHARACTER;
3125 bracket_elem_t start_elem, end_elem;
3126 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3127 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3129 int token_len2 = 0, is_range_exp = 0;
3132 start_elem.opr.name = start_name_buf;
3133 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3134 syntax, first_round);
3135 if (BE (ret != REG_NOERROR, 0))
3138 goto parse_bracket_exp_free_return;
3142 /* Get information about the next token. We need it in any case. */
3143 token_len = peek_token_bracket (token, regexp, syntax);
3145 /* Do not check for ranges if we know they are not allowed. */
3146 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3148 if (BE (token->type == END_OF_RE, 0))
3151 goto parse_bracket_exp_free_return;
3153 if (token->type == OP_CHARSET_RANGE)
3155 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3156 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3157 if (BE (token2.type == END_OF_RE, 0))
3160 goto parse_bracket_exp_free_return;
3162 if (token2.type == OP_CLOSE_BRACKET)
3164 /* We treat the last '-' as a normal character. */
3165 re_string_skip_bytes (regexp, -token_len);
3166 token->type = CHARACTER;
3173 if (is_range_exp == 1)
3175 end_elem.opr.name = end_name_buf;
3176 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3178 if (BE (ret != REG_NOERROR, 0))
3181 goto parse_bracket_exp_free_return;
3184 token_len = peek_token_bracket (token, regexp, syntax);
3187 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3188 &start_elem, &end_elem);
3190 # ifdef RE_ENABLE_I18N
3191 *err = build_range_exp (sbcset,
3192 dfa->mb_cur_max > 1 ? mbcset : NULL,
3193 &range_alloc, &start_elem, &end_elem);
3195 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3197 #endif /* RE_ENABLE_I18N */
3198 if (BE (*err != REG_NOERROR, 0))
3199 goto parse_bracket_exp_free_return;
3203 switch (start_elem.type)
3206 bitset_set (sbcset, start_elem.opr.ch);
3208 #ifdef RE_ENABLE_I18N
3210 /* Check whether the array has enough space. */
3211 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3213 wchar_t *new_mbchars;
3214 /* Not enough, realloc it. */
3215 /* +1 in case of mbcset->nmbchars is 0. */
3216 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3217 /* Use realloc since array is NULL if *alloc == 0. */
3218 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3220 if (BE (new_mbchars == NULL, 0))
3221 goto parse_bracket_exp_espace;
3222 mbcset->mbchars = new_mbchars;
3224 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3226 #endif /* RE_ENABLE_I18N */
3228 *err = build_equiv_class (sbcset,
3229 #ifdef RE_ENABLE_I18N
3230 mbcset, &equiv_class_alloc,
3231 #endif /* RE_ENABLE_I18N */
3232 start_elem.opr.name);
3233 if (BE (*err != REG_NOERROR, 0))
3234 goto parse_bracket_exp_free_return;
3237 *err = build_collating_symbol (sbcset,
3238 #ifdef RE_ENABLE_I18N
3239 mbcset, &coll_sym_alloc,
3240 #endif /* RE_ENABLE_I18N */
3241 start_elem.opr.name);
3242 if (BE (*err != REG_NOERROR, 0))
3243 goto parse_bracket_exp_free_return;
3246 *err = build_charclass (regexp->trans, sbcset,
3247 #ifdef RE_ENABLE_I18N
3248 mbcset, &char_class_alloc,
3249 #endif /* RE_ENABLE_I18N */
3250 (const char *) start_elem.opr.name, syntax);
3251 if (BE (*err != REG_NOERROR, 0))
3252 goto parse_bracket_exp_free_return;
3259 if (BE (token->type == END_OF_RE, 0))
3262 goto parse_bracket_exp_free_return;
3264 if (token->type == OP_CLOSE_BRACKET)
3268 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3270 /* If it is non-matching list. */
3272 bitset_not (sbcset);
3274 #ifdef RE_ENABLE_I18N
3275 /* Ensure only single byte characters are set. */
3276 if (dfa->mb_cur_max > 1)
3277 bitset_mask (sbcset, dfa->sb_char);
3279 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3280 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3281 || mbcset->non_match)))
3283 bin_tree_t *mbc_tree;
3285 /* Build a tree for complex bracket. */
3286 dfa->has_mb_node = 1;
3287 br_token.type = COMPLEX_BRACKET;
3288 br_token.opr.mbcset = mbcset;
3289 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3290 if (BE (mbc_tree == NULL, 0))
3291 goto parse_bracket_exp_espace;
3292 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3293 if (sbcset[sbc_idx])
3295 /* If there are no bits set in sbcset, there is no point
3296 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3297 if (sbc_idx < BITSET_WORDS)
3299 /* Build a tree for simple bracket. */
3300 br_token.type = SIMPLE_BRACKET;
3301 br_token.opr.sbcset = sbcset;
3302 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3303 if (BE (work_tree == NULL, 0))
3304 goto parse_bracket_exp_espace;
3306 /* Then join them by ALT node. */
3307 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3308 if (BE (work_tree == NULL, 0))
3309 goto parse_bracket_exp_espace;
3314 work_tree = mbc_tree;
3318 #endif /* not RE_ENABLE_I18N */
3320 #ifdef RE_ENABLE_I18N
3321 free_charset (mbcset);
3323 /* Build a tree for simple bracket. */
3324 br_token.type = SIMPLE_BRACKET;
3325 br_token.opr.sbcset = sbcset;
3326 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3327 if (BE (work_tree == NULL, 0))
3328 goto parse_bracket_exp_espace;
3332 parse_bracket_exp_espace:
3334 parse_bracket_exp_free_return:
3336 #ifdef RE_ENABLE_I18N
3337 free_charset (mbcset);
3338 #endif /* RE_ENABLE_I18N */
3342 /* Parse an element in the bracket expression. */
3344 static reg_errcode_t
3345 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3346 re_token_t *token, int token_len, re_dfa_t *dfa,
3347 reg_syntax_t syntax, int accept_hyphen)
3349 #ifdef RE_ENABLE_I18N
3351 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3352 if (cur_char_size > 1)
3354 elem->type = MB_CHAR;
3355 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3356 re_string_skip_bytes (regexp, cur_char_size);
3359 #endif /* RE_ENABLE_I18N */
3360 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3361 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3362 || token->type == OP_OPEN_EQUIV_CLASS)
3363 return parse_bracket_symbol (elem, regexp, token);
3364 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3366 /* A '-' must only appear as anything but a range indicator before
3367 the closing bracket. Everything else is an error. */
3369 (void) peek_token_bracket (&token2, regexp, syntax);
3370 if (token2.type != OP_CLOSE_BRACKET)
3371 /* The actual error value is not standardized since this whole
3372 case is undefined. But ERANGE makes good sense. */
3375 elem->type = SB_CHAR;
3376 elem->opr.ch = token->opr.c;
3380 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3381 such as [:<character_class>:], [.<collating_element>.], and
3382 [=<equivalent_class>=]. */
3384 static reg_errcode_t
3385 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3388 unsigned char ch, delim = token->opr.c;
3390 if (re_string_eoi(regexp))
3394 if (i >= BRACKET_NAME_BUF_SIZE)
3396 if (token->type == OP_OPEN_CHAR_CLASS)
3397 ch = re_string_fetch_byte_case (regexp);
3399 ch = re_string_fetch_byte (regexp);
3400 if (re_string_eoi(regexp))
3402 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3404 elem->opr.name[i] = ch;
3406 re_string_skip_bytes (regexp, 1);
3407 elem->opr.name[i] = '\0';
3408 switch (token->type)
3410 case OP_OPEN_COLL_ELEM:
3411 elem->type = COLL_SYM;
3413 case OP_OPEN_EQUIV_CLASS:
3414 elem->type = EQUIV_CLASS;
3416 case OP_OPEN_CHAR_CLASS:
3417 elem->type = CHAR_CLASS;
3425 /* Helper function for parse_bracket_exp.
3426 Build the equivalence class which is represented by NAME.
3427 The result are written to MBCSET and SBCSET.
3428 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3429 is a pointer argument since we may update it. */
3431 static reg_errcode_t
3432 #ifdef RE_ENABLE_I18N
3433 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3434 int *equiv_class_alloc, const unsigned char *name)
3435 #else /* not RE_ENABLE_I18N */
3436 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3437 #endif /* not RE_ENABLE_I18N */
3440 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3443 const int32_t *table, *indirect;
3444 const unsigned char *weights, *extra, *cp;
3445 unsigned char char_buf[2];
3449 /* This #include defines a local function! */
3450 # include <locale/weight.h>
3451 /* Calculate the index for equivalence class. */
3453 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3454 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3455 _NL_COLLATE_WEIGHTMB);
3456 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3457 _NL_COLLATE_EXTRAMB);
3458 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3459 _NL_COLLATE_INDIRECTMB);
3460 idx1 = findidx (&cp);
3461 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3462 /* This isn't a valid character. */
3463 return REG_ECOLLATE;
3465 /* Build single byte matcing table for this equivalence class. */
3466 char_buf[1] = (unsigned char) '\0';
3467 len = weights[idx1 & 0xffffff];
3468 for (ch = 0; ch < SBC_MAX; ++ch)
3472 idx2 = findidx (&cp);
3477 /* This isn't a valid character. */
3479 /* Compare only if the length matches and the collation rule
3480 index is the same. */
3481 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3485 while (cnt <= len &&
3486 weights[(idx1 & 0xffffff) + 1 + cnt]
3487 == weights[(idx2 & 0xffffff) + 1 + cnt])
3491 bitset_set (sbcset, ch);
3494 /* Check whether the array has enough space. */
3495 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3497 /* Not enough, realloc it. */
3498 /* +1 in case of mbcset->nequiv_classes is 0. */
3499 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3500 /* Use realloc since the array is NULL if *alloc == 0. */
3501 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3503 new_equiv_class_alloc);
3504 if (BE (new_equiv_classes == NULL, 0))
3506 mbcset->equiv_classes = new_equiv_classes;
3507 *equiv_class_alloc = new_equiv_class_alloc;
3509 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3514 if (BE (strlen ((const char *) name) != 1, 0))
3515 return REG_ECOLLATE;
3516 bitset_set (sbcset, *name);
3521 /* Helper function for parse_bracket_exp.
3522 Build the character class which is represented by NAME.
3523 The result are written to MBCSET and SBCSET.
3524 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3525 is a pointer argument since we may update it. */
3527 static reg_errcode_t
3528 #ifdef RE_ENABLE_I18N
3529 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3530 re_charset_t *mbcset, int *char_class_alloc,
3531 const char *class_name, reg_syntax_t syntax)
3532 #else /* not RE_ENABLE_I18N */
3533 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3534 const char *class_name, reg_syntax_t syntax)
3535 #endif /* not RE_ENABLE_I18N */
3539 /* In case of REG_ICASE "upper" and "lower" match the both of
3540 upper and lower cases. */
3541 if ((syntax & RE_ICASE)
3542 && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3543 class_name = "alpha";
3545 #ifdef RE_ENABLE_I18N
3546 /* Check the space of the arrays. */
3547 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3549 /* Not enough, realloc it. */
3550 /* +1 in case of mbcset->nchar_classes is 0. */
3551 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3552 /* Use realloc since array is NULL if *alloc == 0. */
3553 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3554 new_char_class_alloc);
3555 if (BE (new_char_classes == NULL, 0))
3557 mbcset->char_classes = new_char_classes;
3558 *char_class_alloc = new_char_class_alloc;
3560 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3561 #endif /* RE_ENABLE_I18N */
3563 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3565 if (BE (trans != NULL, 0)) \
3567 for (i = 0; i < SBC_MAX; ++i) \
3568 if (ctype_func (i)) \
3569 bitset_set (sbcset, trans[i]); \
3573 for (i = 0; i < SBC_MAX; ++i) \
3574 if (ctype_func (i)) \
3575 bitset_set (sbcset, i); \
3579 if (strcmp (class_name, "alnum") == 0)
3580 BUILD_CHARCLASS_LOOP (isalnum);
3581 else if (strcmp (class_name, "cntrl") == 0)
3582 BUILD_CHARCLASS_LOOP (iscntrl);
3583 else if (strcmp (class_name, "lower") == 0)
3584 BUILD_CHARCLASS_LOOP (islower);
3585 else if (strcmp (class_name, "space") == 0)
3586 BUILD_CHARCLASS_LOOP (isspace);
3587 else if (strcmp (class_name, "alpha") == 0)
3588 BUILD_CHARCLASS_LOOP (isalpha);
3589 else if (strcmp (class_name, "digit") == 0)
3590 BUILD_CHARCLASS_LOOP (isdigit);
3591 else if (strcmp (class_name, "print") == 0)
3592 BUILD_CHARCLASS_LOOP (isprint);
3593 else if (strcmp (class_name, "upper") == 0)
3594 BUILD_CHARCLASS_LOOP (isupper);
3595 else if (strcmp (class_name, "blank") == 0)
3597 BUILD_CHARCLASS_LOOP (isblank);
3599 /* see comments above */
3600 BUILD_CHARCLASS_LOOP (is_blank);
3602 else if (strcmp (class_name, "graph") == 0)
3603 BUILD_CHARCLASS_LOOP (isgraph);
3604 else if (strcmp (class_name, "punct") == 0)
3605 BUILD_CHARCLASS_LOOP (ispunct);
3606 else if (strcmp (class_name, "xdigit") == 0)
3607 BUILD_CHARCLASS_LOOP (isxdigit);
3615 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3616 const char *class_name,
3617 const char *extra, int non_match,
3620 re_bitset_ptr_t sbcset;
3621 #ifdef RE_ENABLE_I18N
3622 re_charset_t *mbcset;
3624 #endif /* not RE_ENABLE_I18N */
3626 re_token_t br_token;
3629 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3630 #ifdef RE_ENABLE_I18N
3631 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3632 #endif /* RE_ENABLE_I18N */
3634 #ifdef RE_ENABLE_I18N
3635 if (BE (sbcset == NULL || mbcset == NULL, 0))
3636 #else /* not RE_ENABLE_I18N */
3637 if (BE (sbcset == NULL, 0))
3638 #endif /* not RE_ENABLE_I18N */
3646 #ifdef RE_ENABLE_I18N
3647 mbcset->non_match = 1;
3648 #endif /* not RE_ENABLE_I18N */
3651 /* We don't care the syntax in this case. */
3652 ret = build_charclass (trans, sbcset,
3653 #ifdef RE_ENABLE_I18N
3655 #endif /* RE_ENABLE_I18N */
3658 if (BE (ret != REG_NOERROR, 0))
3661 #ifdef RE_ENABLE_I18N
3662 free_charset (mbcset);
3663 #endif /* RE_ENABLE_I18N */
3667 /* \w match '_' also. */
3668 for (; *extra; extra++)
3669 bitset_set (sbcset, *extra);
3671 /* If it is non-matching list. */
3673 bitset_not (sbcset);
3675 #ifdef RE_ENABLE_I18N
3676 /* Ensure only single byte characters are set. */
3677 if (dfa->mb_cur_max > 1)
3678 bitset_mask (sbcset, dfa->sb_char);
3681 /* Build a tree for simple bracket. */
3682 br_token.type = SIMPLE_BRACKET;
3683 br_token.opr.sbcset = sbcset;
3684 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3685 if (BE (tree == NULL, 0))
3686 goto build_word_op_espace;
3688 #ifdef RE_ENABLE_I18N
3689 if (dfa->mb_cur_max > 1)
3691 bin_tree_t *mbc_tree;
3692 /* Build a tree for complex bracket. */
3693 br_token.type = COMPLEX_BRACKET;
3694 br_token.opr.mbcset = mbcset;
3695 dfa->has_mb_node = 1;
3696 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3697 if (BE (mbc_tree == NULL, 0))
3698 goto build_word_op_espace;
3699 /* Then join them by ALT node. */
3700 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3701 if (BE (mbc_tree != NULL, 1))
3706 free_charset (mbcset);
3709 #else /* not RE_ENABLE_I18N */
3711 #endif /* not RE_ENABLE_I18N */
3713 build_word_op_espace:
3715 #ifdef RE_ENABLE_I18N
3716 free_charset (mbcset);
3717 #endif /* RE_ENABLE_I18N */
3722 /* This is intended for the expressions like "a{1,3}".
3723 Fetch a number from `input', and return the number.
3724 Return -1, if the number field is empty like "{,1}".
3725 Return -2, if an error has occurred. */
3728 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3734 fetch_token (token, input, syntax);
3736 if (BE (token->type == END_OF_RE, 0))
3738 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3740 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3741 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3742 num = (num > RE_DUP_MAX) ? -2 : num;
3747 #ifdef RE_ENABLE_I18N
3749 free_charset (re_charset_t *cset)
3751 re_free (cset->mbchars);
3753 re_free (cset->coll_syms);
3754 re_free (cset->equiv_classes);
3755 re_free (cset->range_starts);
3756 re_free (cset->range_ends);
3758 re_free (cset->char_classes);
3761 #endif /* RE_ENABLE_I18N */
3763 /* Functions for binary tree operation. */
3765 /* Create a tree node. */
3768 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3769 re_token_type_t type)
3773 return create_token_tree (dfa, left, right, &t);
3777 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3778 const re_token_t *token)
3781 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3783 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3785 if (storage == NULL)
3787 storage->next = dfa->str_tree_storage;
3788 dfa->str_tree_storage = storage;
3789 dfa->str_tree_storage_idx = 0;
3791 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3793 tree->parent = NULL;
3795 tree->right = right;
3796 tree->token = *token;
3797 tree->token.duplicated = 0;
3798 tree->token.opt_subexp = 0;
3801 tree->node_idx = -1;
3804 left->parent = tree;
3806 right->parent = tree;
3810 /* Mark the tree SRC as an optional subexpression.
3811 To be called from preorder or postorder. */
3813 static reg_errcode_t
3814 mark_opt_subexp (void *extra, bin_tree_t *node)
3816 int idx = (int) (intptr_t) extra;
3817 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3818 node->token.opt_subexp = 1;
3823 /* Free the allocated memory inside NODE. */
3826 free_token (re_token_t *node)
3828 #ifdef RE_ENABLE_I18N
3829 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3830 free_charset (node->opr.mbcset);
3832 #endif /* RE_ENABLE_I18N */
3833 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3834 re_free (node->opr.sbcset);
3837 /* Worker function for tree walking. Free the allocated memory inside NODE
3838 and its children. */
3840 static reg_errcode_t
3841 free_tree (void *extra, bin_tree_t *node)
3843 free_token (&node->token);
3848 /* Duplicate the node SRC, and return new node. This is a preorder
3849 visit similar to the one implemented by the generic visitor, but
3850 we need more infrastructure to maintain two parallel trees --- so,
3851 it's easier to duplicate. */
3854 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3856 const bin_tree_t *node;
3857 bin_tree_t *dup_root;
3858 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3860 for (node = root; ; )
3862 /* Create a new tree and link it back to the current parent. */
3863 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3866 (*p_new)->parent = dup_node;
3867 (*p_new)->token.duplicated = 1;
3870 /* Go to the left node, or up and to the right. */
3874 p_new = &dup_node->left;
3878 const bin_tree_t *prev = NULL;
3879 while (node->right == prev || node->right == NULL)
3882 node = node->parent;
3883 dup_node = dup_node->parent;
3888 p_new = &dup_node->right;