1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
28 static void free_charset (re_charset_t *cset);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t *preg);
31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 static void optimize_utf8 (re_dfa_t *dfa);
35 static reg_errcode_t analyze (regex_t *preg);
36 static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
39 static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
50 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
51 unsigned int constraint);
52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56 static int fetch_number (re_string_t *input, re_token_t *token,
58 static int peek_token (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax) internal_function;
60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
82 re_token_t *token, int token_len,
86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
92 int *equiv_class_alloc,
93 const unsigned char *name);
94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
97 int *char_class_alloc,
98 const char *class_name,
100 #else /* not RE_ENABLE_I18N */
101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name);
103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
105 const char *class_name,
106 reg_syntax_t syntax);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans,
110 const char *class_name,
112 int non_match, reg_errcode_t *err);
113 static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right,
115 re_token_type_t type);
116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 const re_token_t *token);
119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120 static void free_token (re_token_t *node);
121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
124 /* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
129 const char __re_error_msgid[] attribute_hidden =
131 #define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
137 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
143 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
152 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
155 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
158 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
161 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
164 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
167 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
170 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
176 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183 const size_t __re_error_msgid_idx[] attribute_hidden =
204 /* Entry points for GNU code. */
209 /* For ZOS USS we must define btowc */
220 mbtowc (wtmp, tmp, 1);
225 /* re_compile_pattern is the GNU regular expression compiler: it
226 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
227 Returns 0 if the pattern was valid, otherwise an error string.
229 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
230 are set in BUFP on entry. */
233 re_compile_pattern (const char *pattern,
235 struct re_pattern_buffer *bufp)
239 /* And GNU code determines whether or not to get register information
240 by passing null for the REGS argument to re_match, etc., not by
241 setting no_sub, unless RE_NO_SUB is set. */
242 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
244 /* Match anchors at newline. */
245 bufp->newline_anchor = 1;
247 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
251 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
254 weak_alias (__re_compile_pattern, re_compile_pattern)
257 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
258 also be assigned to arbitrarily: each pattern buffer stores its own
259 syntax, so it can be changed between regex compilations. */
260 /* This has no initializer because initialized variables in Emacs
261 become read-only after dumping. */
262 reg_syntax_t re_syntax_options;
265 /* Specify the precise syntax of regexps for compilation. This provides
266 for compatibility for various utilities which historically have
267 different, incompatible syntaxes.
269 The argument SYNTAX is a bit mask comprised of the various bits
270 defined in regex.h. We return the old syntax. */
273 re_set_syntax (reg_syntax_t syntax)
275 reg_syntax_t ret = re_syntax_options;
277 re_syntax_options = syntax;
281 weak_alias (__re_set_syntax, re_set_syntax)
285 re_compile_fastmap (struct re_pattern_buffer *bufp)
287 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
288 char *fastmap = bufp->fastmap;
290 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
291 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
292 if (dfa->init_state != dfa->init_state_word)
293 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
294 if (dfa->init_state != dfa->init_state_nl)
295 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
296 if (dfa->init_state != dfa->init_state_begbuf)
297 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
298 bufp->fastmap_accurate = 1;
302 weak_alias (__re_compile_fastmap, re_compile_fastmap)
306 __attribute ((always_inline))
307 re_set_fastmap (char *fastmap, int icase, int ch)
311 fastmap[tolower (ch)] = 1;
314 /* Helper function for re_compile_fastmap.
315 Compile fastmap for the initial_state INIT_STATE. */
318 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
321 volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
323 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
324 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
326 int node = init_state->nodes.elems[node_cnt];
327 re_token_type_t type = dfa->nodes[node].type;
329 if (type == CHARACTER)
331 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
332 #ifdef RE_ENABLE_I18N
333 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
335 unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
340 *p++ = dfa->nodes[node].opr.c;
341 while (++node < dfa->nodes_len
342 && dfa->nodes[node].type == CHARACTER
343 && dfa->nodes[node].mb_partial)
344 *p++ = dfa->nodes[node].opr.c;
345 memset (&state, '\0', sizeof (state));
346 if (__mbrtowc (&wc, (const char *) buf, p - buf,
348 && (__wcrtomb ((char *) buf, towlower (wc), &state)
350 re_set_fastmap (fastmap, 0, buf[0]);
355 else if (type == SIMPLE_BRACKET)
358 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
361 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
362 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
363 if (w & ((bitset_word_t) 1 << j))
364 re_set_fastmap (fastmap, icase, ch);
367 #ifdef RE_ENABLE_I18N
368 else if (type == COMPLEX_BRACKET)
370 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
374 /* See if we have to try all bytes which start multiple collation
376 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
377 collation element, and don't catch 'b' since 'b' is
378 the only collation element which starts from 'b' (and
379 it is caught by SIMPLE_BRACKET). */
380 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
381 && (cset->ncoll_syms || cset->nranges))
383 const int32_t *table = (const int32_t *)
384 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
385 for (i = 0; i < SBC_MAX; ++i)
387 re_set_fastmap (fastmap, icase, i);
391 /* See if we have to start the match at all multibyte characters,
392 i.e. where we would not find an invalid sequence. This only
393 applies to multibyte character sets; for single byte character
394 sets, the SIMPLE_BRACKET again suffices. */
395 if (dfa->mb_cur_max > 1
396 && (cset->nchar_classes || cset->non_match || cset->nranges
398 || cset->nequiv_classes
406 memset (&mbs, 0, sizeof (mbs));
407 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
408 re_set_fastmap (fastmap, false, (int) c);
415 /* ... Else catch all bytes which can start the mbchars. */
416 for (i = 0; i < cset->nmbchars; ++i)
420 memset (&state, '\0', sizeof (state));
421 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
422 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
423 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
425 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
427 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
432 #endif /* RE_ENABLE_I18N */
433 else if (type == OP_PERIOD
434 #ifdef RE_ENABLE_I18N
435 || type == OP_UTF8_PERIOD
436 #endif /* RE_ENABLE_I18N */
437 || type == END_OF_RE)
439 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
440 if (type == END_OF_RE)
441 bufp->can_be_null = 1;
447 /* Entry point for POSIX code. */
448 /* regcomp takes a regular expression as a string and compiles it.
450 PREG is a regex_t *. We do not expect any fields to be initialized,
451 since POSIX says we shouldn't. Thus, we set
453 `buffer' to the compiled pattern;
454 `used' to the length of the compiled pattern;
455 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
456 REG_EXTENDED bit in CFLAGS is set; otherwise, to
457 RE_SYNTAX_POSIX_BASIC;
458 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
459 `fastmap' to an allocated space for the fastmap;
460 `fastmap_accurate' to zero;
461 `re_nsub' to the number of subexpressions in PATTERN.
463 PATTERN is the address of the pattern string.
465 CFLAGS is a series of bits which affect compilation.
467 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
468 use POSIX basic syntax.
470 If REG_NEWLINE is set, then . and [^...] don't match newline.
471 Also, regexec will try a match beginning after every newline.
473 If REG_ICASE is set, then we considers upper- and lowercase
474 versions of letters to be equivalent when matching.
476 If REG_NOSUB is set, then when PREG is passed to regexec, that
477 routine will report only success or failure, and nothing about the
480 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
481 the return codes and their meanings.) */
484 regcomp (regex_t *__restrict preg,
485 const char *__restrict pattern,
489 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
490 : RE_SYNTAX_POSIX_BASIC);
496 /* Try to allocate space for the fastmap. */
497 preg->fastmap = re_malloc (char, SBC_MAX);
498 if (BE (preg->fastmap == NULL, 0))
501 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
503 /* If REG_NEWLINE is set, newlines are treated differently. */
504 if (cflags & REG_NEWLINE)
505 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
506 syntax &= ~RE_DOT_NEWLINE;
507 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
508 /* It also changes the matching behavior. */
509 preg->newline_anchor = 1;
512 preg->newline_anchor = 0;
513 preg->no_sub = !!(cflags & REG_NOSUB);
514 preg->translate = NULL;
516 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
518 /* POSIX doesn't distinguish between an unmatched open-group and an
519 unmatched close-group: both are REG_EPAREN. */
520 if (ret == REG_ERPAREN)
523 /* We have already checked preg->fastmap != NULL. */
524 if (BE (ret == REG_NOERROR, 1))
525 /* Compute the fastmap now, since regexec cannot modify the pattern
526 buffer. This function never fails in this implementation. */
527 (void) re_compile_fastmap (preg);
530 /* Some error occurred while compiling the expression. */
531 re_free (preg->fastmap);
532 preg->fastmap = NULL;
538 weak_alias (__regcomp, regcomp)
541 /* Returns a message corresponding to an error code, ERRCODE, returned
542 from either regcomp or regexec. We don't use PREG here. */
545 regerror(int errcode, const regex_t *__restrict preg,
546 char *__restrict errbuf, size_t errbuf_size)
552 || errcode >= (int) (sizeof (__re_error_msgid_idx)
553 / sizeof (__re_error_msgid_idx[0])), 0))
554 /* Only error codes returned by the rest of the code should be passed
555 to this routine. If we are given anything else, or if other regex
556 code generates an invalid error code, then the program has a bug.
557 Dump core so we can fix it. */
560 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
562 msg_size = strlen (msg) + 1; /* Includes the null. */
564 if (BE (errbuf_size != 0, 1))
566 if (BE (msg_size > errbuf_size, 0))
568 memcpy (errbuf, msg, errbuf_size - 1);
569 errbuf[errbuf_size - 1] = 0;
572 memcpy (errbuf, msg, msg_size);
578 weak_alias (__regerror, regerror)
582 #ifdef RE_ENABLE_I18N
583 /* This static array is used for the map to single-byte characters when
584 UTF-8 is used. Otherwise we would allocate memory just to initialize
585 it the same all the time. UTF-8 is the preferred encoding so this is
586 a worthwhile optimization. */
588 static const bitset_t utf8_sb_map = {
589 /* Set the first 128 bits. */
590 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
592 #else /* ! (__GNUC__ >= 3) */
593 static bitset_t utf8_sb_map;
594 #endif /* __GNUC__ >= 3 */
595 #endif /* RE_ENABLE_I18N */
599 free_dfa_content (re_dfa_t *dfa)
604 for (i = 0; i < dfa->nodes_len; ++i)
605 free_token (dfa->nodes + i);
606 re_free (dfa->nexts);
607 for (i = 0; i < dfa->nodes_len; ++i)
609 if (dfa->eclosures != NULL)
610 re_node_set_free (dfa->eclosures + i);
611 if (dfa->inveclosures != NULL)
612 re_node_set_free (dfa->inveclosures + i);
613 if (dfa->edests != NULL)
614 re_node_set_free (dfa->edests + i);
616 re_free (dfa->edests);
617 re_free (dfa->eclosures);
618 re_free (dfa->inveclosures);
619 re_free (dfa->nodes);
621 if (dfa->state_table)
622 for (i = 0; i <= dfa->state_hash_mask; ++i)
624 struct re_state_table_entry *entry = dfa->state_table + i;
625 for (j = 0; j < entry->num; ++j)
627 re_dfastate_t *state = entry->array[j];
630 re_free (entry->array);
632 re_free (dfa->state_table);
633 #ifdef RE_ENABLE_I18N
634 if (dfa->sb_char != utf8_sb_map)
635 re_free (dfa->sb_char);
637 re_free (dfa->subexp_map);
639 re_free (dfa->re_str);
646 /* Free dynamically allocated space used by PREG. */
649 regfree (regex_t *preg)
651 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
652 if (BE (dfa != NULL, 1))
653 free_dfa_content (dfa);
657 re_free (preg->fastmap);
658 preg->fastmap = NULL;
660 re_free (preg->translate);
661 preg->translate = NULL;
664 weak_alias (__regfree, regfree)
667 /* Entry points compatible with 4.2 BSD regex library. We don't define
668 them unless specifically requested. */
670 #if defined _REGEX_RE_COMP || defined _LIBC
672 /* BSD has one and only one pattern buffer. */
673 static struct re_pattern_buffer re_comp_buf;
677 /* Make these definitions weak in libc, so POSIX programs can redefine
678 these names if they don't use our functions, and still use
679 regcomp/regexec above without link errors. */
690 if (!re_comp_buf.buffer)
691 return gettext ("No previous regular expression");
695 if (re_comp_buf.buffer)
697 fastmap = re_comp_buf.fastmap;
698 re_comp_buf.fastmap = NULL;
699 __regfree (&re_comp_buf);
700 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
701 re_comp_buf.fastmap = fastmap;
704 if (re_comp_buf.fastmap == NULL)
706 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
707 if (re_comp_buf.fastmap == NULL)
708 return (char *) gettext (__re_error_msgid
709 + __re_error_msgid_idx[(int) REG_ESPACE]);
712 /* Since `re_exec' always passes NULL for the `regs' argument, we
713 don't need to initialize the pattern buffer fields which affect it. */
715 /* Match anchors at newlines. */
716 re_comp_buf.newline_anchor = 1;
718 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
723 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
724 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
728 libc_freeres_fn (free_mem)
730 __regfree (&re_comp_buf);
734 #endif /* _REGEX_RE_COMP */
736 /* Internal entry point.
737 Compile the regular expression PATTERN, whose length is LENGTH.
738 SYNTAX indicate regular expression's syntax. */
741 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
744 reg_errcode_t err = REG_NOERROR;
748 /* Initialize the pattern buffer. */
749 preg->fastmap_accurate = 0;
750 preg->syntax = syntax;
751 preg->not_bol = preg->not_eol = 0;
754 preg->can_be_null = 0;
755 preg->regs_allocated = REGS_UNALLOCATED;
757 /* Initialize the dfa. */
758 dfa = (re_dfa_t *) preg->buffer;
759 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
761 /* If zero allocated, but buffer is non-null, try to realloc
762 enough space. This loses if buffer's address is bogus, but
763 that is the user's responsibility. If ->buffer is NULL this
764 is a simple allocation. */
765 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
768 preg->allocated = sizeof (re_dfa_t);
769 preg->buffer = (unsigned char *) dfa;
771 preg->used = sizeof (re_dfa_t);
773 err = init_dfa (dfa, length);
774 if (BE (err != REG_NOERROR, 0))
776 free_dfa_content (dfa);
782 /* Note: length+1 will not overflow since it is checked in init_dfa. */
783 dfa->re_str = re_malloc (char, length + 1);
784 strncpy (dfa->re_str, pattern, length + 1);
787 __libc_lock_init (dfa->lock);
789 err = re_string_construct (®exp, pattern, length, preg->translate,
790 syntax & RE_ICASE, dfa);
791 if (BE (err != REG_NOERROR, 0))
793 re_compile_internal_free_return:
794 free_workarea_compile (preg);
795 re_string_destruct (®exp);
796 free_dfa_content (dfa);
802 /* Parse the regular expression, and build a structure tree. */
804 dfa->str_tree = parse (®exp, preg, syntax, &err);
805 if (BE (dfa->str_tree == NULL, 0))
806 goto re_compile_internal_free_return;
808 /* Analyze the tree and create the nfa. */
809 err = analyze (preg);
810 if (BE (err != REG_NOERROR, 0))
811 goto re_compile_internal_free_return;
813 #ifdef RE_ENABLE_I18N
814 /* If possible, do searching in single byte encoding to speed things up. */
815 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
819 /* Then create the initial state of the dfa. */
820 err = create_initial_state (dfa);
822 /* Release work areas. */
823 free_workarea_compile (preg);
824 re_string_destruct (®exp);
826 if (BE (err != REG_NOERROR, 0))
828 free_dfa_content (dfa);
836 /* Initialize DFA. We use the length of the regular expression PAT_LEN
837 as the initial length of some arrays. */
840 init_dfa (re_dfa_t *dfa, size_t pat_len)
842 unsigned int table_size;
847 memset (dfa, '\0', sizeof (re_dfa_t));
849 /* Force allocation of str_tree_storage the first time. */
850 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
852 /* Avoid overflows. */
853 if (pat_len == SIZE_MAX)
856 dfa->nodes_alloc = pat_len + 1;
857 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
859 /* table_size = 2 ^ ceil(log pat_len) */
860 for (table_size = 1; ; table_size <<= 1)
861 if (table_size > pat_len)
864 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
865 dfa->state_hash_mask = table_size - 1;
867 dfa->mb_cur_max = MB_CUR_MAX;
869 if (dfa->mb_cur_max == 6
870 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
872 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
875 # ifdef HAVE_LANGINFO_CODESET
876 codeset_name = nl_langinfo (CODESET);
878 codeset_name = getenv ("LC_ALL");
879 if (codeset_name == NULL || codeset_name[0] == '\0')
880 codeset_name = getenv ("LC_CTYPE");
881 if (codeset_name == NULL || codeset_name[0] == '\0')
882 codeset_name = getenv ("LANG");
883 if (codeset_name == NULL)
885 else if (strchr (codeset_name, '.') != NULL)
886 codeset_name = strchr (codeset_name, '.') + 1;
889 /* strcasecmp isn't a standard interface. brute force check */
891 if (strcasecmp (codeset_name, "UTF-8") == 0
892 || strcasecmp (codeset_name, "UTF8") == 0)
895 if ( (codeset_name[0] == 'U' || codeset_name[0] == 'u')
896 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
897 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
898 && (codeset_name[3] == '-'
899 ? codeset_name[4] == '8' && codeset_name[5] == '\0'
900 : codeset_name[3] == '8' && codeset_name[4] == '\0'))
904 /* We check exhaustively in the loop below if this charset is a
905 superset of ASCII. */
906 dfa->map_notascii = 0;
909 #ifdef RE_ENABLE_I18N
910 if (dfa->mb_cur_max > 1)
914 #if !defined(__GNUC__) || __GNUC__ < 3
915 static short utf8_sb_map_inited = 0;
917 if (! utf8_sb_map_inited)
921 utf8_sb_map_inited = 0;
922 for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
923 utf8_sb_map[i] = BITSET_WORD_MAX;
926 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
932 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
933 if (BE (dfa->sb_char == NULL, 0))
936 /* Set the bits corresponding to single byte chars. */
937 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
938 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
940 wint_t wch = __btowc (ch);
942 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
944 if (isascii (ch) && wch != ch)
945 dfa->map_notascii = 1;
952 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
957 /* Initialize WORD_CHAR table, which indicate which character is
958 "word". In this case "word" means that it is the word construction
959 character used by some operators like "\<", "\>", etc. */
963 init_word_char (re_dfa_t *dfa)
966 dfa->word_ops_used = 1;
967 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
968 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
969 if (isalnum (ch) || ch == '_')
970 dfa->word_char[i] |= (bitset_word_t) 1 << j;
973 /* Free the work area which are only used while compiling. */
976 free_workarea_compile (regex_t *preg)
978 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
979 bin_tree_storage_t *storage, *next;
980 for (storage = dfa->str_tree_storage; storage; storage = next)
982 next = storage->next;
985 dfa->str_tree_storage = NULL;
986 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
987 dfa->str_tree = NULL;
988 re_free (dfa->org_indices);
989 dfa->org_indices = NULL;
992 /* Create initial states for all contexts. */
995 create_initial_state (re_dfa_t *dfa)
999 re_node_set init_nodes;
1001 /* Initial states have the epsilon closure of the node which is
1002 the first node of the regular expression. */
1003 first = dfa->str_tree->first->node_idx;
1004 dfa->init_node = first;
1005 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1006 if (BE (err != REG_NOERROR, 0))
1009 /* The back-references which are in initial states can epsilon transit,
1010 since in this case all of the subexpressions can be null.
1011 Then we add epsilon closures of the nodes which are the next nodes of
1012 the back-references. */
1013 if (dfa->nbackref > 0)
1014 for (i = 0; i < init_nodes.nelem; ++i)
1016 int node_idx = init_nodes.elems[i];
1017 re_token_type_t type = dfa->nodes[node_idx].type;
1020 if (type != OP_BACK_REF)
1022 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1024 re_token_t *clexp_node;
1025 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1026 if (clexp_node->type == OP_CLOSE_SUBEXP
1027 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1030 if (clexp_idx == init_nodes.nelem)
1033 if (type == OP_BACK_REF)
1035 int dest_idx = dfa->edests[node_idx].elems[0];
1036 if (!re_node_set_contains (&init_nodes, dest_idx))
1038 reg_errcode_t err = re_node_set_merge (&init_nodes,
1041 if (err != REG_NOERROR)
1048 /* It must be the first time to invoke acquire_state. */
1049 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1050 /* We don't check ERR here, since the initial state must not be NULL. */
1051 if (BE (dfa->init_state == NULL, 0))
1053 if (dfa->init_state->has_constraint)
1055 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1057 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1059 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1063 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1064 || dfa->init_state_begbuf == NULL, 0))
1068 dfa->init_state_word = dfa->init_state_nl
1069 = dfa->init_state_begbuf = dfa->init_state;
1071 re_node_set_free (&init_nodes);
1075 #ifdef RE_ENABLE_I18N
1076 /* If it is possible to do searching in single byte encoding instead of UTF-8
1077 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1078 DFA nodes where needed. */
1081 optimize_utf8 (re_dfa_t *dfa)
1083 int node, i, mb_chars = 0, has_period = 0;
1085 for (node = 0; node < dfa->nodes_len; ++node)
1086 switch (dfa->nodes[node].type)
1089 if (dfa->nodes[node].opr.c >= 0x80)
1093 switch (dfa->nodes[node].opr.ctx_type)
1101 /* Word anchors etc. cannot be handled. It's okay to test
1102 opr.ctx_type since constraints (for all DFA nodes) are
1103 created by ORing one or more opr.ctx_type values. */
1113 case OP_DUP_ASTERISK:
1114 case OP_OPEN_SUBEXP:
1115 case OP_CLOSE_SUBEXP:
1117 case COMPLEX_BRACKET:
1119 case SIMPLE_BRACKET:
1120 /* Just double check. The non-ASCII range starts at 0x80. */
1121 assert (0x80 % BITSET_WORD_BITS == 0);
1122 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1123 if (dfa->nodes[node].opr.sbcset[i])
1130 if (mb_chars || has_period)
1131 for (node = 0; node < dfa->nodes_len; ++node)
1133 if (dfa->nodes[node].type == CHARACTER
1134 && dfa->nodes[node].opr.c >= 0x80)
1135 dfa->nodes[node].mb_partial = 0;
1136 else if (dfa->nodes[node].type == OP_PERIOD)
1137 dfa->nodes[node].type = OP_UTF8_PERIOD;
1140 /* The search can be in single byte locale. */
1141 dfa->mb_cur_max = 1;
1143 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1147 /* Analyze the structure tree, and calculate "first", "next", "edest",
1148 "eclosure", and "inveclosure". */
1150 static reg_errcode_t
1151 analyze (regex_t *preg)
1153 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1156 /* Allocate arrays. */
1157 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1158 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1159 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1160 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1161 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1162 || dfa->eclosures == NULL, 0))
1165 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1166 if (dfa->subexp_map != NULL)
1169 for (i = 0; i < preg->re_nsub; i++)
1170 dfa->subexp_map[i] = i;
1171 preorder (dfa->str_tree, optimize_subexps, dfa);
1172 for (i = 0; i < preg->re_nsub; i++)
1173 if (dfa->subexp_map[i] != i)
1175 if (i == preg->re_nsub)
1177 free (dfa->subexp_map);
1178 dfa->subexp_map = NULL;
1182 ret = postorder (dfa->str_tree, lower_subexps, preg);
1183 if (BE (ret != REG_NOERROR, 0))
1185 ret = postorder (dfa->str_tree, calc_first, dfa);
1186 if (BE (ret != REG_NOERROR, 0))
1188 preorder (dfa->str_tree, calc_next, dfa);
1189 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1190 if (BE (ret != REG_NOERROR, 0))
1192 ret = calc_eclosure (dfa);
1193 if (BE (ret != REG_NOERROR, 0))
1196 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1197 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1198 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1201 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1202 if (BE (dfa->inveclosures == NULL, 0))
1204 ret = calc_inveclosure (dfa);
1210 /* Our parse trees are very unbalanced, so we cannot use a stack to
1211 implement parse tree visits. Instead, we use parent pointers and
1212 some hairy code in these two functions. */
1213 static reg_errcode_t
1214 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1217 bin_tree_t *node, *prev;
1219 for (node = root; ; )
1221 /* Descend down the tree, preferably to the left (or to the right
1222 if that's the only child). */
1223 while (node->left || node->right)
1231 reg_errcode_t err = fn (extra, node);
1232 if (BE (err != REG_NOERROR, 0))
1234 if (node->parent == NULL)
1237 node = node->parent;
1239 /* Go up while we have a node that is reached from the right. */
1240 while (node->right == prev || node->right == NULL);
1245 static reg_errcode_t
1246 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1251 for (node = root; ; )
1253 reg_errcode_t err = fn (extra, node);
1254 if (BE (err != REG_NOERROR, 0))
1257 /* Go to the left node, or up and to the right. */
1262 bin_tree_t *prev = NULL;
1263 while (node->right == prev || node->right == NULL)
1266 node = node->parent;
1275 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1276 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1277 backreferences as well. Requires a preorder visit. */
1278 static reg_errcode_t
1279 optimize_subexps (void *extra, bin_tree_t *node)
1281 re_dfa_t *dfa = (re_dfa_t *) extra;
1283 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1285 int idx = node->token.opr.idx;
1286 node->token.opr.idx = dfa->subexp_map[idx];
1287 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1290 else if (node->token.type == SUBEXP
1291 && node->left && node->left->token.type == SUBEXP)
1293 int other_idx = node->left->token.opr.idx;
1295 node->left = node->left->left;
1297 node->left->parent = node;
1299 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1300 if (other_idx < BITSET_WORD_BITS)
1301 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1307 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1308 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1309 static reg_errcode_t
1310 lower_subexps (void *extra, bin_tree_t *node)
1312 regex_t *preg = (regex_t *) extra;
1313 reg_errcode_t err = REG_NOERROR;
1315 if (node->left && node->left->token.type == SUBEXP)
1317 node->left = lower_subexp (&err, preg, node->left);
1319 node->left->parent = node;
1321 if (node->right && node->right->token.type == SUBEXP)
1323 node->right = lower_subexp (&err, preg, node->right);
1325 node->right->parent = node;
1332 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1334 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1335 bin_tree_t *body = node->left;
1336 bin_tree_t *op, *cls, *tree1, *tree;
1339 /* We do not optimize empty subexpressions, because otherwise we may
1340 have bad CONCAT nodes with NULL children. This is obviously not
1341 very common, so we do not lose much. An example that triggers
1342 this case is the sed "script" /\(\)/x. */
1343 && node->left != NULL
1344 && (node->token.opr.idx >= BITSET_WORD_BITS
1345 || !(dfa->used_bkref_map
1346 & ((bitset_word_t) 1 << node->token.opr.idx))))
1349 /* Convert the SUBEXP node to the concatenation of an
1350 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1351 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1352 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1353 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1354 tree = create_tree (dfa, op, tree1, CONCAT);
1355 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1361 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1362 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1366 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1367 nodes. Requires a postorder visit. */
1368 static reg_errcode_t
1369 calc_first (void *extra, bin_tree_t *node)
1371 re_dfa_t *dfa = (re_dfa_t *) extra;
1372 if (node->token.type == CONCAT)
1374 node->first = node->left->first;
1375 node->node_idx = node->left->node_idx;
1380 node->node_idx = re_dfa_add_node (dfa, node->token);
1381 if (BE (node->node_idx == -1, 0))
1383 if (node->token.type == ANCHOR)
1384 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1389 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1390 static reg_errcode_t
1391 calc_next (void *extra, bin_tree_t *node)
1393 switch (node->token.type)
1395 case OP_DUP_ASTERISK:
1396 node->left->next = node;
1399 node->left->next = node->right->first;
1400 node->right->next = node->next;
1404 node->left->next = node->next;
1406 node->right->next = node->next;
1412 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1413 static reg_errcode_t
1414 link_nfa_nodes (void *extra, bin_tree_t *node)
1416 re_dfa_t *dfa = (re_dfa_t *) extra;
1417 int idx = node->node_idx;
1418 reg_errcode_t err = REG_NOERROR;
1420 switch (node->token.type)
1426 assert (node->next == NULL);
1429 case OP_DUP_ASTERISK:
1433 dfa->has_plural_match = 1;
1434 if (node->left != NULL)
1435 left = node->left->first->node_idx;
1437 left = node->next->node_idx;
1438 if (node->right != NULL)
1439 right = node->right->first->node_idx;
1441 right = node->next->node_idx;
1443 assert (right > -1);
1444 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1449 case OP_OPEN_SUBEXP:
1450 case OP_CLOSE_SUBEXP:
1451 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1455 dfa->nexts[idx] = node->next->node_idx;
1456 if (node->token.type == OP_BACK_REF)
1457 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1461 assert (!IS_EPSILON_NODE (node->token.type));
1462 dfa->nexts[idx] = node->next->node_idx;
1469 /* Duplicate the epsilon closure of the node ROOT_NODE.
1470 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1471 to their own constraint. */
1473 static reg_errcode_t
1475 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1476 int root_node, unsigned int init_constraint)
1478 int org_node, clone_node, ret;
1479 unsigned int constraint = init_constraint;
1480 for (org_node = top_org_node, clone_node = top_clone_node;;)
1482 int org_dest, clone_dest;
1483 if (dfa->nodes[org_node].type == OP_BACK_REF)
1485 /* If the back reference epsilon-transit, its destination must
1486 also have the constraint. Then duplicate the epsilon closure
1487 of the destination of the back reference, and store it in
1488 edests of the back reference. */
1489 org_dest = dfa->nexts[org_node];
1490 re_node_set_empty (dfa->edests + clone_node);
1491 clone_dest = duplicate_node (dfa, org_dest, constraint);
1492 if (BE (clone_dest == -1, 0))
1494 dfa->nexts[clone_node] = dfa->nexts[org_node];
1495 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1496 if (BE (ret < 0, 0))
1499 else if (dfa->edests[org_node].nelem == 0)
1501 /* In case of the node can't epsilon-transit, don't duplicate the
1502 destination and store the original destination as the
1503 destination of the node. */
1504 dfa->nexts[clone_node] = dfa->nexts[org_node];
1507 else if (dfa->edests[org_node].nelem == 1)
1509 /* In case of the node can epsilon-transit, and it has only one
1511 org_dest = dfa->edests[org_node].elems[0];
1512 re_node_set_empty (dfa->edests + clone_node);
1513 /* If the node is root_node itself, it means the epsilon clsoure
1514 has a loop. Then tie it to the destination of the root_node. */
1515 if (org_node == root_node && clone_node != org_node)
1517 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1518 if (BE (ret < 0, 0))
1522 /* In case of the node has another constraint, add it. */
1523 constraint |= dfa->nodes[org_node].constraint;
1524 clone_dest = duplicate_node (dfa, org_dest, constraint);
1525 if (BE (clone_dest == -1, 0))
1527 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1528 if (BE (ret < 0, 0))
1531 else /* dfa->edests[org_node].nelem == 2 */
1533 /* In case of the node can epsilon-transit, and it has two
1534 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1535 org_dest = dfa->edests[org_node].elems[0];
1536 re_node_set_empty (dfa->edests + clone_node);
1537 /* Search for a duplicated node which satisfies the constraint. */
1538 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1539 if (clone_dest == -1)
1541 /* There is no such duplicated node, create a new one. */
1543 clone_dest = duplicate_node (dfa, org_dest, constraint);
1544 if (BE (clone_dest == -1, 0))
1546 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1547 if (BE (ret < 0, 0))
1549 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1550 root_node, constraint);
1551 if (BE (err != REG_NOERROR, 0))
1556 /* There is a duplicated node which satisfies the constraint,
1557 use it to avoid infinite loop. */
1558 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1559 if (BE (ret < 0, 0))
1563 org_dest = dfa->edests[org_node].elems[1];
1564 clone_dest = duplicate_node (dfa, org_dest, constraint);
1565 if (BE (clone_dest == -1, 0))
1567 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1568 if (BE (ret < 0, 0))
1571 org_node = org_dest;
1572 clone_node = clone_dest;
1577 /* Search for a node which is duplicated from the node ORG_NODE, and
1578 satisfies the constraint CONSTRAINT. */
1581 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1582 unsigned int constraint)
1585 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1587 if (org_node == dfa->org_indices[idx]
1588 && constraint == dfa->nodes[idx].constraint)
1589 return idx; /* Found. */
1591 return -1; /* Not found. */
1594 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1595 Return the index of the new node, or -1 if insufficient storage is
1599 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1601 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1602 if (BE (dup_idx != -1, 1))
1604 dfa->nodes[dup_idx].constraint = constraint;
1605 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1606 dfa->nodes[dup_idx].duplicated = 1;
1608 /* Store the index of the original node. */
1609 dfa->org_indices[dup_idx] = org_idx;
1614 static reg_errcode_t
1615 calc_inveclosure (re_dfa_t *dfa)
1618 for (idx = 0; idx < dfa->nodes_len; ++idx)
1619 re_node_set_init_empty (dfa->inveclosures + idx);
1621 for (src = 0; src < dfa->nodes_len; ++src)
1623 int *elems = dfa->eclosures[src].elems;
1624 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1626 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1627 if (BE (ret == -1, 0))
1635 /* Calculate "eclosure" for all the node in DFA. */
1637 static reg_errcode_t
1638 calc_eclosure (re_dfa_t *dfa)
1640 int node_idx, incomplete;
1642 assert (dfa->nodes_len > 0);
1645 /* For each nodes, calculate epsilon closure. */
1646 for (node_idx = 0; ; ++node_idx)
1649 re_node_set eclosure_elem;
1650 if (node_idx == dfa->nodes_len)
1659 assert (dfa->eclosures[node_idx].nelem != -1);
1662 /* If we have already calculated, skip it. */
1663 if (dfa->eclosures[node_idx].nelem != 0)
1665 /* Calculate epsilon closure of `node_idx'. */
1666 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1667 if (BE (err != REG_NOERROR, 0))
1670 if (dfa->eclosures[node_idx].nelem == 0)
1673 re_node_set_free (&eclosure_elem);
1679 /* Calculate epsilon closure of NODE. */
1681 static reg_errcode_t
1682 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1686 re_node_set eclosure;
1689 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1690 if (BE (err != REG_NOERROR, 0))
1693 /* This indicates that we are calculating this node now.
1694 We reference this value to avoid infinite loop. */
1695 dfa->eclosures[node].nelem = -1;
1697 /* If the current node has constraints, duplicate all nodes
1698 since they must inherit the constraints. */
1699 if (dfa->nodes[node].constraint
1700 && dfa->edests[node].nelem
1701 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1703 err = duplicate_node_closure (dfa, node, node, node,
1704 dfa->nodes[node].constraint);
1705 if (BE (err != REG_NOERROR, 0))
1709 /* Expand each epsilon destination nodes. */
1710 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1711 for (i = 0; i < dfa->edests[node].nelem; ++i)
1713 re_node_set eclosure_elem;
1714 int edest = dfa->edests[node].elems[i];
1715 /* If calculating the epsilon closure of `edest' is in progress,
1716 return intermediate result. */
1717 if (dfa->eclosures[edest].nelem == -1)
1722 /* If we haven't calculated the epsilon closure of `edest' yet,
1723 calculate now. Otherwise use calculated epsilon closure. */
1724 if (dfa->eclosures[edest].nelem == 0)
1726 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1727 if (BE (err != REG_NOERROR, 0))
1731 eclosure_elem = dfa->eclosures[edest];
1732 /* Merge the epsilon closure of `edest'. */
1733 err = re_node_set_merge (&eclosure, &eclosure_elem);
1734 if (BE (err != REG_NOERROR, 0))
1736 /* If the epsilon closure of `edest' is incomplete,
1737 the epsilon closure of this node is also incomplete. */
1738 if (dfa->eclosures[edest].nelem == 0)
1741 re_node_set_free (&eclosure_elem);
1745 /* An epsilon closure includes itself. */
1746 ret = re_node_set_insert (&eclosure, node);
1747 if (BE (ret < 0, 0))
1749 if (incomplete && !root)
1750 dfa->eclosures[node].nelem = 0;
1752 dfa->eclosures[node] = eclosure;
1753 *new_set = eclosure;
1757 /* Functions for token which are used in the parser. */
1759 /* Fetch a token from INPUT.
1760 We must not use this function inside bracket expressions. */
1764 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1766 re_string_skip_bytes (input, peek_token (result, input, syntax));
1769 /* Peek a token from INPUT, and return the length of the token.
1770 We must not use this function inside bracket expressions. */
1774 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1778 if (re_string_eoi (input))
1780 token->type = END_OF_RE;
1784 c = re_string_peek_byte (input, 0);
1787 token->word_char = 0;
1788 #ifdef RE_ENABLE_I18N
1789 token->mb_partial = 0;
1790 if (input->mb_cur_max > 1 &&
1791 !re_string_first_byte (input, re_string_cur_idx (input)))
1793 token->type = CHARACTER;
1794 token->mb_partial = 1;
1801 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1803 token->type = BACK_SLASH;
1807 c2 = re_string_peek_byte_case (input, 1);
1809 token->type = CHARACTER;
1810 #ifdef RE_ENABLE_I18N
1811 if (input->mb_cur_max > 1)
1813 wint_t wc = re_string_wchar_at (input,
1814 re_string_cur_idx (input) + 1);
1815 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1819 token->word_char = IS_WORD_CHAR (c2) != 0;
1824 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1825 token->type = OP_ALT;
1827 case '1': case '2': case '3': case '4': case '5':
1828 case '6': case '7': case '8': case '9':
1829 if (!(syntax & RE_NO_BK_REFS))
1831 token->type = OP_BACK_REF;
1832 token->opr.idx = c2 - '1';
1836 if (!(syntax & RE_NO_GNU_OPS))
1838 token->type = ANCHOR;
1839 token->opr.ctx_type = WORD_FIRST;
1843 if (!(syntax & RE_NO_GNU_OPS))
1845 token->type = ANCHOR;
1846 token->opr.ctx_type = WORD_LAST;
1850 if (!(syntax & RE_NO_GNU_OPS))
1852 token->type = ANCHOR;
1853 token->opr.ctx_type = WORD_DELIM;
1857 if (!(syntax & RE_NO_GNU_OPS))
1859 token->type = ANCHOR;
1860 token->opr.ctx_type = NOT_WORD_DELIM;
1864 if (!(syntax & RE_NO_GNU_OPS))
1865 token->type = OP_WORD;
1868 if (!(syntax & RE_NO_GNU_OPS))
1869 token->type = OP_NOTWORD;
1872 if (!(syntax & RE_NO_GNU_OPS))
1873 token->type = OP_SPACE;
1876 if (!(syntax & RE_NO_GNU_OPS))
1877 token->type = OP_NOTSPACE;
1880 if (!(syntax & RE_NO_GNU_OPS))
1882 token->type = ANCHOR;
1883 token->opr.ctx_type = BUF_FIRST;
1887 if (!(syntax & RE_NO_GNU_OPS))
1889 token->type = ANCHOR;
1890 token->opr.ctx_type = BUF_LAST;
1894 if (!(syntax & RE_NO_BK_PARENS))
1895 token->type = OP_OPEN_SUBEXP;
1898 if (!(syntax & RE_NO_BK_PARENS))
1899 token->type = OP_CLOSE_SUBEXP;
1902 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1903 token->type = OP_DUP_PLUS;
1906 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1907 token->type = OP_DUP_QUESTION;
1910 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1911 token->type = OP_OPEN_DUP_NUM;
1914 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1915 token->type = OP_CLOSE_DUP_NUM;
1923 token->type = CHARACTER;
1924 #ifdef RE_ENABLE_I18N
1925 if (input->mb_cur_max > 1)
1927 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1928 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1932 token->word_char = IS_WORD_CHAR (token->opr.c);
1937 if (syntax & RE_NEWLINE_ALT)
1938 token->type = OP_ALT;
1941 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1942 token->type = OP_ALT;
1945 token->type = OP_DUP_ASTERISK;
1948 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1949 token->type = OP_DUP_PLUS;
1952 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1953 token->type = OP_DUP_QUESTION;
1956 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1957 token->type = OP_OPEN_DUP_NUM;
1960 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1961 token->type = OP_CLOSE_DUP_NUM;
1964 if (syntax & RE_NO_BK_PARENS)
1965 token->type = OP_OPEN_SUBEXP;
1968 if (syntax & RE_NO_BK_PARENS)
1969 token->type = OP_CLOSE_SUBEXP;
1972 token->type = OP_OPEN_BRACKET;
1975 token->type = OP_PERIOD;
1978 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1979 re_string_cur_idx (input) != 0)
1981 char prev = re_string_peek_byte (input, -1);
1982 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1985 token->type = ANCHOR;
1986 token->opr.ctx_type = LINE_FIRST;
1989 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1990 re_string_cur_idx (input) + 1 != re_string_length (input))
1993 re_string_skip_bytes (input, 1);
1994 peek_token (&next, input, syntax);
1995 re_string_skip_bytes (input, -1);
1996 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1999 token->type = ANCHOR;
2000 token->opr.ctx_type = LINE_LAST;
2008 /* Peek a token from INPUT, and return the length of the token.
2009 We must not use this function out of bracket expressions. */
2013 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2016 if (re_string_eoi (input))
2018 token->type = END_OF_RE;
2021 c = re_string_peek_byte (input, 0);
2024 #ifdef RE_ENABLE_I18N
2025 if (input->mb_cur_max > 1 &&
2026 !re_string_first_byte (input, re_string_cur_idx (input)))
2028 token->type = CHARACTER;
2031 #endif /* RE_ENABLE_I18N */
2033 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2034 && re_string_cur_idx (input) + 1 < re_string_length (input))
2036 /* In this case, '\' escape a character. */
2038 re_string_skip_bytes (input, 1);
2039 c2 = re_string_peek_byte (input, 0);
2041 token->type = CHARACTER;
2044 if (c == '[') /* '[' is a special char in a bracket exps. */
2048 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2049 c2 = re_string_peek_byte (input, 1);
2057 token->type = OP_OPEN_COLL_ELEM;
2060 token->type = OP_OPEN_EQUIV_CLASS;
2063 if (syntax & RE_CHAR_CLASSES)
2065 token->type = OP_OPEN_CHAR_CLASS;
2068 /* else fall through. */
2070 token->type = CHARACTER;
2080 token->type = OP_CHARSET_RANGE;
2083 token->type = OP_CLOSE_BRACKET;
2086 token->type = OP_NON_MATCH_LIST;
2089 token->type = CHARACTER;
2094 /* Functions for parser. */
2096 /* Entry point of the parser.
2097 Parse the regular expression REGEXP and return the structure tree.
2098 If an error has occurred, ERR is set by error code, and return NULL.
2099 This function build the following tree, from regular expression <reg_exp>:
2105 CAT means concatenation.
2106 EOR means end of regular expression. */
2109 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2112 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2113 bin_tree_t *tree, *eor, *root;
2114 re_token_t current_token;
2115 dfa->syntax = syntax;
2116 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2117 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2118 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2120 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2122 root = create_tree (dfa, tree, eor, CONCAT);
2125 if (BE (eor == NULL || root == NULL, 0))
2133 /* This function build the following tree, from regular expression
2134 <branch1>|<branch2>:
2140 ALT means alternative, which represents the operator `|'. */
2143 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2144 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2146 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2147 bin_tree_t *tree, *branch = NULL;
2148 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2149 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2152 while (token->type == OP_ALT)
2154 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2155 if (token->type != OP_ALT && token->type != END_OF_RE
2156 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2158 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2159 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2164 tree = create_tree (dfa, tree, branch, OP_ALT);
2165 if (BE (tree == NULL, 0))
2174 /* This function build the following tree, from regular expression
2181 CAT means concatenation. */
2184 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2185 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2187 bin_tree_t *tree, *exp;
2188 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2189 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2190 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2193 while (token->type != OP_ALT && token->type != END_OF_RE
2194 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2196 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2197 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2201 if (tree != NULL && exp != NULL)
2203 tree = create_tree (dfa, tree, exp, CONCAT);
2210 else if (tree == NULL)
2212 /* Otherwise exp == NULL, we don't need to create new tree. */
2217 /* This function build the following tree, from regular expression a*:
2224 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2225 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2227 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2229 switch (token->type)
2232 tree = create_token_tree (dfa, NULL, NULL, token);
2233 if (BE (tree == NULL, 0))
2238 #ifdef RE_ENABLE_I18N
2239 if (dfa->mb_cur_max > 1)
2241 while (!re_string_eoi (regexp)
2242 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2244 bin_tree_t *mbc_remain;
2245 fetch_token (token, regexp, syntax);
2246 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2247 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2248 if (BE (mbc_remain == NULL || tree == NULL, 0))
2257 case OP_OPEN_SUBEXP:
2258 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2259 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2262 case OP_OPEN_BRACKET:
2263 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2264 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2268 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2273 dfa->used_bkref_map |= 1 << token->opr.idx;
2274 tree = create_token_tree (dfa, NULL, NULL, token);
2275 if (BE (tree == NULL, 0))
2281 dfa->has_mb_node = 1;
2283 case OP_OPEN_DUP_NUM:
2284 if (syntax & RE_CONTEXT_INVALID_DUP)
2290 case OP_DUP_ASTERISK:
2292 case OP_DUP_QUESTION:
2293 if (syntax & RE_CONTEXT_INVALID_OPS)
2298 else if (syntax & RE_CONTEXT_INDEP_OPS)
2300 fetch_token (token, regexp, syntax);
2301 return parse_expression (regexp, preg, token, syntax, nest, err);
2303 /* else fall through */
2304 case OP_CLOSE_SUBEXP:
2305 if ((token->type == OP_CLOSE_SUBEXP) &&
2306 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2311 /* else fall through */
2312 case OP_CLOSE_DUP_NUM:
2313 /* We treat it as a normal character. */
2315 /* Then we can these characters as normal characters. */
2316 token->type = CHARACTER;
2317 /* mb_partial and word_char bits should be initialized already
2319 tree = create_token_tree (dfa, NULL, NULL, token);
2320 if (BE (tree == NULL, 0))
2327 if ((token->opr.ctx_type
2328 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2329 && dfa->word_ops_used == 0)
2330 init_word_char (dfa);
2331 if (token->opr.ctx_type == WORD_DELIM
2332 || token->opr.ctx_type == NOT_WORD_DELIM)
2334 bin_tree_t *tree_first, *tree_last;
2335 if (token->opr.ctx_type == WORD_DELIM)
2337 token->opr.ctx_type = WORD_FIRST;
2338 tree_first = create_token_tree (dfa, NULL, NULL, token);
2339 token->opr.ctx_type = WORD_LAST;
2343 token->opr.ctx_type = INSIDE_WORD;
2344 tree_first = create_token_tree (dfa, NULL, NULL, token);
2345 token->opr.ctx_type = INSIDE_NOTWORD;
2347 tree_last = create_token_tree (dfa, NULL, NULL, token);
2348 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2349 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2357 tree = create_token_tree (dfa, NULL, NULL, token);
2358 if (BE (tree == NULL, 0))
2364 /* We must return here, since ANCHORs can't be followed
2365 by repetition operators.
2366 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2367 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2368 fetch_token (token, regexp, syntax);
2371 tree = create_token_tree (dfa, NULL, NULL, token);
2372 if (BE (tree == NULL, 0))
2377 if (dfa->mb_cur_max > 1)
2378 dfa->has_mb_node = 1;
2382 tree = build_charclass_op (dfa, regexp->trans,
2385 token->type == OP_NOTWORD, err);
2386 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2391 tree = build_charclass_op (dfa, regexp->trans,
2394 token->type == OP_NOTSPACE, err);
2395 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2405 /* Must not happen? */
2411 fetch_token (token, regexp, syntax);
2413 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2414 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2416 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2417 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2419 /* In BRE consecutive duplications are not allowed. */
2420 if ((syntax & RE_CONTEXT_INVALID_DUP)
2421 && (token->type == OP_DUP_ASTERISK
2422 || token->type == OP_OPEN_DUP_NUM))
2432 /* This function build the following tree, from regular expression
2440 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2441 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2443 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2446 cur_nsub = preg->re_nsub++;
2448 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2450 /* The subexpression may be a null string. */
2451 if (token->type == OP_CLOSE_SUBEXP)
2455 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2456 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2458 if (BE (*err != REG_NOERROR, 0))
2462 if (cur_nsub <= '9' - '1')
2463 dfa->completed_bkref_map |= 1 << cur_nsub;
2465 tree = create_tree (dfa, tree, NULL, SUBEXP);
2466 if (BE (tree == NULL, 0))
2471 tree->token.opr.idx = cur_nsub;
2475 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2478 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2479 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2481 bin_tree_t *tree = NULL, *old_tree = NULL;
2482 int i, start, end, start_idx = re_string_cur_idx (regexp);
2483 #ifndef RE_TOKEN_INIT_BUG
2484 re_token_t start_token = *token;
2486 re_token_t start_token;
2488 memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2491 if (token->type == OP_OPEN_DUP_NUM)
2494 start = fetch_number (regexp, token, syntax);
2497 if (token->type == CHARACTER && token->opr.c == ',')
2498 start = 0; /* We treat "{,m}" as "{0,m}". */
2501 *err = REG_BADBR; /* <re>{} is invalid. */
2505 if (BE (start != -2, 1))
2507 /* We treat "{n}" as "{n,n}". */
2508 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2509 : ((token->type == CHARACTER && token->opr.c == ',')
2510 ? fetch_number (regexp, token, syntax) : -2));
2512 if (BE (start == -2 || end == -2, 0))
2514 /* Invalid sequence. */
2515 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2517 if (token->type == END_OF_RE)
2525 /* If the syntax bit is set, rollback. */
2526 re_string_set_index (regexp, start_idx);
2527 *token = start_token;
2528 token->type = CHARACTER;
2529 /* mb_partial and word_char bits should be already initialized by
2534 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2536 /* First number greater than second. */
2543 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2544 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2547 fetch_token (token, regexp, syntax);
2549 if (BE (elem == NULL, 0))
2551 if (BE (start == 0 && end == 0, 0))
2553 postorder (elem, free_tree, NULL);
2557 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2558 if (BE (start > 0, 0))
2561 for (i = 2; i <= start; ++i)
2563 elem = duplicate_tree (elem, dfa);
2564 tree = create_tree (dfa, tree, elem, CONCAT);
2565 if (BE (elem == NULL || tree == NULL, 0))
2566 goto parse_dup_op_espace;
2572 /* Duplicate ELEM before it is marked optional. */
2573 elem = duplicate_tree (elem, dfa);
2579 if (elem->token.type == SUBEXP)
2580 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2582 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2583 if (BE (tree == NULL, 0))
2584 goto parse_dup_op_espace;
2586 /* This loop is actually executed only when end != -1,
2587 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2588 already created the start+1-th copy. */
2589 for (i = start + 2; i <= end; ++i)
2591 elem = duplicate_tree (elem, dfa);
2592 tree = create_tree (dfa, tree, elem, CONCAT);
2593 if (BE (elem == NULL || tree == NULL, 0))
2594 goto parse_dup_op_espace;
2596 tree = create_tree (dfa, tree, NULL, OP_ALT);
2597 if (BE (tree == NULL, 0))
2598 goto parse_dup_op_espace;
2602 tree = create_tree (dfa, old_tree, tree, CONCAT);
2606 parse_dup_op_espace:
2611 /* Size of the names for collating symbol/equivalence_class/character_class.
2612 I'm not sure, but maybe enough. */
2613 #define BRACKET_NAME_BUF_SIZE 32
2616 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2617 Build the range expression which starts from START_ELEM, and ends
2618 at END_ELEM. The result are written to MBCSET and SBCSET.
2619 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2620 mbcset->range_ends, is a pointer argument since we may
2623 static reg_errcode_t
2625 # ifdef RE_ENABLE_I18N
2626 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2627 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2628 # else /* not RE_ENABLE_I18N */
2629 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2630 bracket_elem_t *end_elem)
2631 # endif /* not RE_ENABLE_I18N */
2633 unsigned int start_ch, end_ch;
2634 /* Equivalence Classes and Character Classes can't be a range start/end. */
2635 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2636 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2640 /* We can handle no multi character collating elements without libc
2642 if (BE ((start_elem->type == COLL_SYM
2643 && strlen ((char *) start_elem->opr.name) > 1)
2644 || (end_elem->type == COLL_SYM
2645 && strlen ((char *) end_elem->opr.name) > 1), 0))
2646 return REG_ECOLLATE;
2648 # ifdef RE_ENABLE_I18N
2653 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2655 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2656 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2658 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2659 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2663 * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2664 * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2665 * unsigned, so we don't have sign extension problems.
2667 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2668 ? start_ch : start_elem->opr.wch);
2669 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2670 ? end_ch : end_elem->opr.wch);
2672 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2673 ? __btowc (start_ch) : start_elem->opr.wch);
2674 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2675 ? __btowc (end_ch) : end_elem->opr.wch);
2677 if (start_wc == WEOF || end_wc == WEOF)
2678 return REG_ECOLLATE;
2679 cmp_buf[0] = start_wc;
2680 cmp_buf[4] = end_wc;
2681 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2684 /* Got valid collation sequence values, add them as a new entry.
2685 However, for !_LIBC we have no collation elements: if the
2686 character set is single byte, the single byte character set
2687 that we build below suffices. parse_bracket_exp passes
2688 no MBCSET if dfa->mb_cur_max == 1. */
2691 /* Check the space of the arrays. */
2692 if (BE (*range_alloc == mbcset->nranges, 0))
2694 /* There is not enough space, need realloc. */
2695 wchar_t *new_array_start, *new_array_end;
2698 /* +1 in case of mbcset->nranges is 0. */
2699 new_nranges = 2 * mbcset->nranges + 1;
2700 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2701 are NULL if *range_alloc == 0. */
2702 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2704 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2707 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2710 mbcset->range_starts = new_array_start;
2711 mbcset->range_ends = new_array_end;
2712 *range_alloc = new_nranges;
2715 mbcset->range_starts[mbcset->nranges] = start_wc;
2716 mbcset->range_ends[mbcset->nranges++] = end_wc;
2719 /* Build the table for single byte characters. */
2720 for (wc = 0; wc < SBC_MAX; ++wc)
2723 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2724 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2725 bitset_set (sbcset, wc);
2728 # else /* not RE_ENABLE_I18N */
2731 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2732 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2734 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2735 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2737 if (start_ch > end_ch)
2739 /* Build the table for single byte characters. */
2740 for (ch = 0; ch < SBC_MAX; ++ch)
2741 if (start_ch <= ch && ch <= end_ch)
2742 bitset_set (sbcset, ch);
2744 # endif /* not RE_ENABLE_I18N */
2747 #endif /* not _LIBC */
2750 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2751 Build the collating element which is represented by NAME.
2752 The result are written to MBCSET and SBCSET.
2753 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2754 pointer argument since we may update it. */
2756 static reg_errcode_t
2758 # ifdef RE_ENABLE_I18N
2759 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2760 int *coll_sym_alloc, const unsigned char *name)
2761 # else /* not RE_ENABLE_I18N */
2762 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2763 # endif /* not RE_ENABLE_I18N */
2765 size_t name_len = strlen ((const char *) name);
2766 if (BE (name_len != 1, 0))
2767 return REG_ECOLLATE;
2770 bitset_set (sbcset, name[0]);
2774 #endif /* not _LIBC */
2776 /* This function parse bracket expression like "[abc]", "[a-c]",
2780 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2781 reg_syntax_t syntax, reg_errcode_t *err)
2784 const unsigned char *collseqmb;
2785 const char *collseqwc;
2788 const int32_t *symb_table;
2789 const unsigned char *extra;
2791 /* Local function for parse_bracket_exp used in _LIBC environment.
2792 Seek the collating symbol entry correspondings to NAME.
2793 Return the index of the symbol in the SYMB_TABLE. */
2796 __attribute ((always_inline))
2797 seek_collating_symbol_entry (name, name_len)
2798 const unsigned char *name;
2801 int32_t hash = elem_hash ((const char *) name, name_len);
2802 int32_t elem = hash % table_size;
2803 if (symb_table[2 * elem] != 0)
2805 int32_t second = hash % (table_size - 2) + 1;
2809 /* First compare the hashing value. */
2810 if (symb_table[2 * elem] == hash
2811 /* Compare the length of the name. */
2812 && name_len == extra[symb_table[2 * elem + 1]]
2813 /* Compare the name. */
2814 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2817 /* Yep, this is the entry. */
2824 while (symb_table[2 * elem] != 0);
2829 /* Local function for parse_bracket_exp used in _LIBC environment.
2830 Look up the collation sequence value of BR_ELEM.
2831 Return the value if succeeded, UINT_MAX otherwise. */
2833 auto inline unsigned int
2834 __attribute ((always_inline))
2835 lookup_collation_sequence_value (br_elem)
2836 bracket_elem_t *br_elem;
2838 if (br_elem->type == SB_CHAR)
2841 if (MB_CUR_MAX == 1)
2844 return collseqmb[br_elem->opr.ch];
2847 wint_t wc = __btowc (br_elem->opr.ch);
2848 return __collseq_table_lookup (collseqwc, wc);
2851 else if (br_elem->type == MB_CHAR)
2854 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2856 else if (br_elem->type == COLL_SYM)
2858 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2862 elem = seek_collating_symbol_entry (br_elem->opr.name,
2864 if (symb_table[2 * elem] != 0)
2866 /* We found the entry. */
2867 idx = symb_table[2 * elem + 1];
2868 /* Skip the name of collating element name. */
2869 idx += 1 + extra[idx];
2870 /* Skip the byte sequence of the collating element. */
2871 idx += 1 + extra[idx];
2872 /* Adjust for the alignment. */
2873 idx = (idx + 3) & ~3;
2874 /* Skip the multibyte collation sequence value. */
2875 idx += sizeof (unsigned int);
2876 /* Skip the wide char sequence of the collating element. */
2877 idx += sizeof (unsigned int) *
2878 (1 + *(unsigned int *) (extra + idx));
2879 /* Return the collation sequence value. */
2880 return *(unsigned int *) (extra + idx);
2882 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2884 /* No valid character. Match it as a single byte
2886 return collseqmb[br_elem->opr.name[0]];
2889 else if (sym_name_len == 1)
2890 return collseqmb[br_elem->opr.name[0]];
2895 /* Local function for parse_bracket_exp used in _LIBC environment.
2896 Build the range expression which starts from START_ELEM, and ends
2897 at END_ELEM. The result are written to MBCSET and SBCSET.
2898 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2899 mbcset->range_ends, is a pointer argument since we may
2902 auto inline reg_errcode_t
2903 __attribute ((always_inline))
2904 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2905 re_charset_t *mbcset;
2908 bracket_elem_t *start_elem, *end_elem;
2911 uint32_t start_collseq;
2912 uint32_t end_collseq;
2914 /* Equivalence Classes and Character Classes can't be a range
2916 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2917 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2921 start_collseq = lookup_collation_sequence_value (start_elem);
2922 end_collseq = lookup_collation_sequence_value (end_elem);
2923 /* Check start/end collation sequence values. */
2924 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2925 return REG_ECOLLATE;
2926 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2929 /* Got valid collation sequence values, add them as a new entry.
2930 However, if we have no collation elements, and the character set
2931 is single byte, the single byte character set that we
2932 build below suffices. */
2933 if (nrules > 0 || dfa->mb_cur_max > 1)
2935 /* Check the space of the arrays. */
2936 if (BE (*range_alloc == mbcset->nranges, 0))
2938 /* There is not enough space, need realloc. */
2939 uint32_t *new_array_start;
2940 uint32_t *new_array_end;
2943 /* +1 in case of mbcset->nranges is 0. */
2944 new_nranges = 2 * mbcset->nranges + 1;
2945 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2947 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2950 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2953 mbcset->range_starts = new_array_start;
2954 mbcset->range_ends = new_array_end;
2955 *range_alloc = new_nranges;
2958 mbcset->range_starts[mbcset->nranges] = start_collseq;
2959 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2962 /* Build the table for single byte characters. */
2963 for (ch = 0; ch < SBC_MAX; ch++)
2965 uint32_t ch_collseq;
2967 if (MB_CUR_MAX == 1)
2970 ch_collseq = collseqmb[ch];
2972 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2973 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2974 bitset_set (sbcset, ch);
2979 /* Local function for parse_bracket_exp used in _LIBC environment.
2980 Build the collating element which is represented by NAME.
2981 The result are written to MBCSET and SBCSET.
2982 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2983 pointer argument since we may update it. */
2985 auto inline reg_errcode_t
2986 __attribute ((always_inline))
2987 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2988 re_charset_t *mbcset;
2989 int *coll_sym_alloc;
2991 const unsigned char *name;
2994 size_t name_len = strlen ((const char *) name);
2997 elem = seek_collating_symbol_entry (name, name_len);
2998 if (symb_table[2 * elem] != 0)
3000 /* We found the entry. */
3001 idx = symb_table[2 * elem + 1];
3002 /* Skip the name of collating element name. */
3003 idx += 1 + extra[idx];
3005 else if (symb_table[2 * elem] == 0 && name_len == 1)
3007 /* No valid character, treat it as a normal
3009 bitset_set (sbcset, name[0]);
3013 return REG_ECOLLATE;
3015 /* Got valid collation sequence, add it as a new entry. */
3016 /* Check the space of the arrays. */
3017 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3019 /* Not enough, realloc it. */
3020 /* +1 in case of mbcset->ncoll_syms is 0. */
3021 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3022 /* Use realloc since mbcset->coll_syms is NULL
3024 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3025 new_coll_sym_alloc);
3026 if (BE (new_coll_syms == NULL, 0))
3028 mbcset->coll_syms = new_coll_syms;
3029 *coll_sym_alloc = new_coll_sym_alloc;
3031 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3036 if (BE (name_len != 1, 0))
3037 return REG_ECOLLATE;
3040 bitset_set (sbcset, name[0]);
3047 re_token_t br_token;
3048 re_bitset_ptr_t sbcset;
3049 #ifdef RE_ENABLE_I18N
3050 re_charset_t *mbcset;
3051 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3052 int equiv_class_alloc = 0, char_class_alloc = 0;
3053 #endif /* not RE_ENABLE_I18N */
3055 bin_tree_t *work_tree;
3057 int first_round = 1;
3059 collseqmb = (const unsigned char *)
3060 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3061 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3067 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3068 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3069 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3070 _NL_COLLATE_SYMB_TABLEMB);
3071 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3072 _NL_COLLATE_SYMB_EXTRAMB);
3075 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3076 #ifdef RE_ENABLE_I18N
3077 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3078 #endif /* RE_ENABLE_I18N */
3079 #ifdef RE_ENABLE_I18N
3080 if (BE (sbcset == NULL || mbcset == NULL, 0))
3082 if (BE (sbcset == NULL, 0))
3083 #endif /* RE_ENABLE_I18N */
3089 token_len = peek_token_bracket (token, regexp, syntax);
3090 if (BE (token->type == END_OF_RE, 0))
3093 goto parse_bracket_exp_free_return;
3095 if (token->type == OP_NON_MATCH_LIST)
3097 #ifdef RE_ENABLE_I18N
3098 mbcset->non_match = 1;
3099 #endif /* not RE_ENABLE_I18N */
3101 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3102 bitset_set (sbcset, '\n');
3103 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3104 token_len = peek_token_bracket (token, regexp, syntax);
3105 if (BE (token->type == END_OF_RE, 0))
3108 goto parse_bracket_exp_free_return;
3112 /* We treat the first ']' as a normal character. */
3113 if (token->type == OP_CLOSE_BRACKET)
3114 token->type = CHARACTER;
3118 bracket_elem_t start_elem, end_elem;
3119 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3120 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3122 int token_len2 = 0, is_range_exp = 0;
3125 start_elem.opr.name = start_name_buf;
3126 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3127 syntax, first_round);
3128 if (BE (ret != REG_NOERROR, 0))
3131 goto parse_bracket_exp_free_return;
3135 /* Get information about the next token. We need it in any case. */
3136 token_len = peek_token_bracket (token, regexp, syntax);
3138 /* Do not check for ranges if we know they are not allowed. */
3139 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3141 if (BE (token->type == END_OF_RE, 0))
3144 goto parse_bracket_exp_free_return;
3146 if (token->type == OP_CHARSET_RANGE)
3148 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3149 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3150 if (BE (token2.type == END_OF_RE, 0))
3153 goto parse_bracket_exp_free_return;
3155 if (token2.type == OP_CLOSE_BRACKET)
3157 /* We treat the last '-' as a normal character. */
3158 re_string_skip_bytes (regexp, -token_len);
3159 token->type = CHARACTER;
3166 if (is_range_exp == 1)
3168 end_elem.opr.name = end_name_buf;
3169 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3171 if (BE (ret != REG_NOERROR, 0))
3174 goto parse_bracket_exp_free_return;
3177 token_len = peek_token_bracket (token, regexp, syntax);
3180 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3181 &start_elem, &end_elem);
3183 # ifdef RE_ENABLE_I18N
3184 *err = build_range_exp (sbcset,
3185 dfa->mb_cur_max > 1 ? mbcset : NULL,
3186 &range_alloc, &start_elem, &end_elem);
3188 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3190 #endif /* RE_ENABLE_I18N */
3191 if (BE (*err != REG_NOERROR, 0))
3192 goto parse_bracket_exp_free_return;
3196 switch (start_elem.type)
3199 bitset_set (sbcset, start_elem.opr.ch);
3201 #ifdef RE_ENABLE_I18N
3203 /* Check whether the array has enough space. */
3204 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3206 wchar_t *new_mbchars;
3207 /* Not enough, realloc it. */
3208 /* +1 in case of mbcset->nmbchars is 0. */
3209 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3210 /* Use realloc since array is NULL if *alloc == 0. */
3211 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3213 if (BE (new_mbchars == NULL, 0))
3214 goto parse_bracket_exp_espace;
3215 mbcset->mbchars = new_mbchars;
3217 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3219 #endif /* RE_ENABLE_I18N */
3221 *err = build_equiv_class (sbcset,
3222 #ifdef RE_ENABLE_I18N
3223 mbcset, &equiv_class_alloc,
3224 #endif /* RE_ENABLE_I18N */
3225 start_elem.opr.name);
3226 if (BE (*err != REG_NOERROR, 0))
3227 goto parse_bracket_exp_free_return;
3230 *err = build_collating_symbol (sbcset,
3231 #ifdef RE_ENABLE_I18N
3232 mbcset, &coll_sym_alloc,
3233 #endif /* RE_ENABLE_I18N */
3234 start_elem.opr.name);
3235 if (BE (*err != REG_NOERROR, 0))
3236 goto parse_bracket_exp_free_return;
3239 *err = build_charclass (regexp->trans, sbcset,
3240 #ifdef RE_ENABLE_I18N
3241 mbcset, &char_class_alloc,
3242 #endif /* RE_ENABLE_I18N */
3243 (const char *) start_elem.opr.name, syntax);
3244 if (BE (*err != REG_NOERROR, 0))
3245 goto parse_bracket_exp_free_return;
3252 if (BE (token->type == END_OF_RE, 0))
3255 goto parse_bracket_exp_free_return;
3257 if (token->type == OP_CLOSE_BRACKET)
3261 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3263 /* If it is non-matching list. */
3265 bitset_not (sbcset);
3267 #ifdef RE_ENABLE_I18N
3268 /* Ensure only single byte characters are set. */
3269 if (dfa->mb_cur_max > 1)
3270 bitset_mask (sbcset, dfa->sb_char);
3272 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3273 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3274 || mbcset->non_match)))
3276 bin_tree_t *mbc_tree;
3278 /* Build a tree for complex bracket. */
3279 dfa->has_mb_node = 1;
3280 br_token.type = COMPLEX_BRACKET;
3281 br_token.opr.mbcset = mbcset;
3282 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3283 if (BE (mbc_tree == NULL, 0))
3284 goto parse_bracket_exp_espace;
3285 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3286 if (sbcset[sbc_idx])
3288 /* If there are no bits set in sbcset, there is no point
3289 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3290 if (sbc_idx < BITSET_WORDS)
3292 /* Build a tree for simple bracket. */
3293 br_token.type = SIMPLE_BRACKET;
3294 br_token.opr.sbcset = sbcset;
3295 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3296 if (BE (work_tree == NULL, 0))
3297 goto parse_bracket_exp_espace;
3299 /* Then join them by ALT node. */
3300 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3301 if (BE (work_tree == NULL, 0))
3302 goto parse_bracket_exp_espace;
3307 work_tree = mbc_tree;
3311 #endif /* not RE_ENABLE_I18N */
3313 #ifdef RE_ENABLE_I18N
3314 free_charset (mbcset);
3316 /* Build a tree for simple bracket. */
3317 br_token.type = SIMPLE_BRACKET;
3318 br_token.opr.sbcset = sbcset;
3319 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3320 if (BE (work_tree == NULL, 0))
3321 goto parse_bracket_exp_espace;
3325 parse_bracket_exp_espace:
3327 parse_bracket_exp_free_return:
3329 #ifdef RE_ENABLE_I18N
3330 free_charset (mbcset);
3331 #endif /* RE_ENABLE_I18N */
3335 /* Parse an element in the bracket expression. */
3337 static reg_errcode_t
3338 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3339 re_token_t *token, int token_len, re_dfa_t *dfa,
3340 reg_syntax_t syntax, int accept_hyphen)
3342 #ifdef RE_ENABLE_I18N
3344 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3345 if (cur_char_size > 1)
3347 elem->type = MB_CHAR;
3348 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3349 re_string_skip_bytes (regexp, cur_char_size);
3352 #endif /* RE_ENABLE_I18N */
3353 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3354 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3355 || token->type == OP_OPEN_EQUIV_CLASS)
3356 return parse_bracket_symbol (elem, regexp, token);
3357 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3359 /* A '-' must only appear as anything but a range indicator before
3360 the closing bracket. Everything else is an error. */
3362 (void) peek_token_bracket (&token2, regexp, syntax);
3363 if (token2.type != OP_CLOSE_BRACKET)
3364 /* The actual error value is not standardized since this whole
3365 case is undefined. But ERANGE makes good sense. */
3368 elem->type = SB_CHAR;
3369 elem->opr.ch = token->opr.c;
3373 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3374 such as [:<character_class>:], [.<collating_element>.], and
3375 [=<equivalent_class>=]. */
3377 static reg_errcode_t
3378 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3381 unsigned char ch, delim = token->opr.c;
3383 if (re_string_eoi(regexp))
3387 if (i >= BRACKET_NAME_BUF_SIZE)
3389 if (token->type == OP_OPEN_CHAR_CLASS)
3390 ch = re_string_fetch_byte_case (regexp);
3392 ch = re_string_fetch_byte (regexp);
3393 if (re_string_eoi(regexp))
3395 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3397 elem->opr.name[i] = ch;
3399 re_string_skip_bytes (regexp, 1);
3400 elem->opr.name[i] = '\0';
3401 switch (token->type)
3403 case OP_OPEN_COLL_ELEM:
3404 elem->type = COLL_SYM;
3406 case OP_OPEN_EQUIV_CLASS:
3407 elem->type = EQUIV_CLASS;
3409 case OP_OPEN_CHAR_CLASS:
3410 elem->type = CHAR_CLASS;
3418 /* Helper function for parse_bracket_exp.
3419 Build the equivalence class which is represented by NAME.
3420 The result are written to MBCSET and SBCSET.
3421 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3422 is a pointer argument since we may update it. */
3424 static reg_errcode_t
3425 #ifdef RE_ENABLE_I18N
3426 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3427 int *equiv_class_alloc, const unsigned char *name)
3428 #else /* not RE_ENABLE_I18N */
3429 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3430 #endif /* not RE_ENABLE_I18N */
3433 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3436 const int32_t *table, *indirect;
3437 const unsigned char *weights, *extra, *cp;
3438 unsigned char char_buf[2];
3442 /* This #include defines a local function! */
3443 # include <locale/weight.h>
3444 /* Calculate the index for equivalence class. */
3446 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3447 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3448 _NL_COLLATE_WEIGHTMB);
3449 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3450 _NL_COLLATE_EXTRAMB);
3451 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3452 _NL_COLLATE_INDIRECTMB);
3453 idx1 = findidx (&cp);
3454 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3455 /* This isn't a valid character. */
3456 return REG_ECOLLATE;
3458 /* Build single byte matcing table for this equivalence class. */
3459 char_buf[1] = (unsigned char) '\0';
3460 len = weights[idx1 & 0xffffff];
3461 for (ch = 0; ch < SBC_MAX; ++ch)
3465 idx2 = findidx (&cp);
3470 /* This isn't a valid character. */
3472 /* Compare only if the length matches and the collation rule
3473 index is the same. */
3474 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3478 while (cnt <= len &&
3479 weights[(idx1 & 0xffffff) + 1 + cnt]
3480 == weights[(idx2 & 0xffffff) + 1 + cnt])
3484 bitset_set (sbcset, ch);
3487 /* Check whether the array has enough space. */
3488 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3490 /* Not enough, realloc it. */
3491 /* +1 in case of mbcset->nequiv_classes is 0. */
3492 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3493 /* Use realloc since the array is NULL if *alloc == 0. */
3494 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3496 new_equiv_class_alloc);
3497 if (BE (new_equiv_classes == NULL, 0))
3499 mbcset->equiv_classes = new_equiv_classes;
3500 *equiv_class_alloc = new_equiv_class_alloc;
3502 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3507 if (BE (strlen ((const char *) name) != 1, 0))
3508 return REG_ECOLLATE;
3509 bitset_set (sbcset, *name);
3514 /* Helper function for parse_bracket_exp.
3515 Build the character class which is represented by NAME.
3516 The result are written to MBCSET and SBCSET.
3517 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3518 is a pointer argument since we may update it. */
3520 static reg_errcode_t
3521 #ifdef RE_ENABLE_I18N
3522 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3523 re_charset_t *mbcset, int *char_class_alloc,
3524 const char *class_name, reg_syntax_t syntax)
3525 #else /* not RE_ENABLE_I18N */
3526 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3527 const char *class_name, reg_syntax_t syntax)
3528 #endif /* not RE_ENABLE_I18N */
3532 /* In case of REG_ICASE "upper" and "lower" match the both of
3533 upper and lower cases. */
3534 if ((syntax & RE_ICASE)
3535 && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3536 class_name = "alpha";
3538 #ifdef RE_ENABLE_I18N
3539 /* Check the space of the arrays. */
3540 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3542 /* Not enough, realloc it. */
3543 /* +1 in case of mbcset->nchar_classes is 0. */
3544 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3545 /* Use realloc since array is NULL if *alloc == 0. */
3546 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3547 new_char_class_alloc);
3548 if (BE (new_char_classes == NULL, 0))
3550 mbcset->char_classes = new_char_classes;
3551 *char_class_alloc = new_char_class_alloc;
3553 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3554 #endif /* RE_ENABLE_I18N */
3556 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3558 if (BE (trans != NULL, 0)) \
3560 for (i = 0; i < SBC_MAX; ++i) \
3561 if (ctype_func (i)) \
3562 bitset_set (sbcset, trans[i]); \
3566 for (i = 0; i < SBC_MAX; ++i) \
3567 if (ctype_func (i)) \
3568 bitset_set (sbcset, i); \
3572 if (strcmp (class_name, "alnum") == 0)
3573 BUILD_CHARCLASS_LOOP (isalnum);
3574 else if (strcmp (class_name, "cntrl") == 0)
3575 BUILD_CHARCLASS_LOOP (iscntrl);
3576 else if (strcmp (class_name, "lower") == 0)
3577 BUILD_CHARCLASS_LOOP (islower);
3578 else if (strcmp (class_name, "space") == 0)
3579 BUILD_CHARCLASS_LOOP (isspace);
3580 else if (strcmp (class_name, "alpha") == 0)
3581 BUILD_CHARCLASS_LOOP (isalpha);
3582 else if (strcmp (class_name, "digit") == 0)
3583 BUILD_CHARCLASS_LOOP (isdigit);
3584 else if (strcmp (class_name, "print") == 0)
3585 BUILD_CHARCLASS_LOOP (isprint);
3586 else if (strcmp (class_name, "upper") == 0)
3587 BUILD_CHARCLASS_LOOP (isupper);
3588 else if (strcmp (class_name, "blank") == 0)
3590 BUILD_CHARCLASS_LOOP (isblank);
3592 /* see comments above */
3593 BUILD_CHARCLASS_LOOP (is_blank);
3595 else if (strcmp (class_name, "graph") == 0)
3596 BUILD_CHARCLASS_LOOP (isgraph);
3597 else if (strcmp (class_name, "punct") == 0)
3598 BUILD_CHARCLASS_LOOP (ispunct);
3599 else if (strcmp (class_name, "xdigit") == 0)
3600 BUILD_CHARCLASS_LOOP (isxdigit);
3608 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3609 const char *class_name,
3610 const char *extra, int non_match,
3613 re_bitset_ptr_t sbcset;
3614 #ifdef RE_ENABLE_I18N
3615 re_charset_t *mbcset;
3617 #endif /* not RE_ENABLE_I18N */
3619 re_token_t br_token;
3622 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3623 #ifdef RE_ENABLE_I18N
3624 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3625 #endif /* RE_ENABLE_I18N */
3627 #ifdef RE_ENABLE_I18N
3628 if (BE (sbcset == NULL || mbcset == NULL, 0))
3629 #else /* not RE_ENABLE_I18N */
3630 if (BE (sbcset == NULL, 0))
3631 #endif /* not RE_ENABLE_I18N */
3639 #ifdef RE_ENABLE_I18N
3640 mbcset->non_match = 1;
3641 #endif /* not RE_ENABLE_I18N */
3644 /* We don't care the syntax in this case. */
3645 ret = build_charclass (trans, sbcset,
3646 #ifdef RE_ENABLE_I18N
3648 #endif /* RE_ENABLE_I18N */
3651 if (BE (ret != REG_NOERROR, 0))
3654 #ifdef RE_ENABLE_I18N
3655 free_charset (mbcset);
3656 #endif /* RE_ENABLE_I18N */
3660 /* \w match '_' also. */
3661 for (; *extra; extra++)
3662 bitset_set (sbcset, *extra);
3664 /* If it is non-matching list. */
3666 bitset_not (sbcset);
3668 #ifdef RE_ENABLE_I18N
3669 /* Ensure only single byte characters are set. */
3670 if (dfa->mb_cur_max > 1)
3671 bitset_mask (sbcset, dfa->sb_char);
3674 /* Build a tree for simple bracket. */
3675 br_token.type = SIMPLE_BRACKET;
3676 br_token.opr.sbcset = sbcset;
3677 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3678 if (BE (tree == NULL, 0))
3679 goto build_word_op_espace;
3681 #ifdef RE_ENABLE_I18N
3682 if (dfa->mb_cur_max > 1)
3684 bin_tree_t *mbc_tree;
3685 /* Build a tree for complex bracket. */
3686 br_token.type = COMPLEX_BRACKET;
3687 br_token.opr.mbcset = mbcset;
3688 dfa->has_mb_node = 1;
3689 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3690 if (BE (mbc_tree == NULL, 0))
3691 goto build_word_op_espace;
3692 /* Then join them by ALT node. */
3693 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3694 if (BE (mbc_tree != NULL, 1))
3699 free_charset (mbcset);
3702 #else /* not RE_ENABLE_I18N */
3704 #endif /* not RE_ENABLE_I18N */
3706 build_word_op_espace:
3708 #ifdef RE_ENABLE_I18N
3709 free_charset (mbcset);
3710 #endif /* RE_ENABLE_I18N */
3715 /* This is intended for the expressions like "a{1,3}".
3716 Fetch a number from `input', and return the number.
3717 Return -1, if the number field is empty like "{,1}".
3718 Return -2, if an error has occurred. */
3721 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3727 fetch_token (token, input, syntax);
3729 if (BE (token->type == END_OF_RE, 0))
3731 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3733 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3734 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3735 num = (num > RE_DUP_MAX) ? -2 : num;
3740 #ifdef RE_ENABLE_I18N
3742 free_charset (re_charset_t *cset)
3744 re_free (cset->mbchars);
3746 re_free (cset->coll_syms);
3747 re_free (cset->equiv_classes);
3748 re_free (cset->range_starts);
3749 re_free (cset->range_ends);
3751 re_free (cset->char_classes);
3754 #endif /* RE_ENABLE_I18N */
3756 /* Functions for binary tree operation. */
3758 /* Create a tree node. */
3761 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3762 re_token_type_t type)
3766 return create_token_tree (dfa, left, right, &t);
3770 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3771 const re_token_t *token)
3774 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3776 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3778 if (storage == NULL)
3780 storage->next = dfa->str_tree_storage;
3781 dfa->str_tree_storage = storage;
3782 dfa->str_tree_storage_idx = 0;
3784 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3786 tree->parent = NULL;
3788 tree->right = right;
3789 tree->token = *token;
3790 tree->token.duplicated = 0;
3791 tree->token.opt_subexp = 0;
3794 tree->node_idx = -1;
3797 left->parent = tree;
3799 right->parent = tree;
3803 /* Mark the tree SRC as an optional subexpression.
3804 To be called from preorder or postorder. */
3806 static reg_errcode_t
3807 mark_opt_subexp (void *extra, bin_tree_t *node)
3809 int idx = (int) (long) extra;
3810 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3811 node->token.opt_subexp = 1;
3816 /* Free the allocated memory inside NODE. */
3819 free_token (re_token_t *node)
3821 #ifdef RE_ENABLE_I18N
3822 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3823 free_charset (node->opr.mbcset);
3825 #endif /* RE_ENABLE_I18N */
3826 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3827 re_free (node->opr.sbcset);
3830 /* Worker function for tree walking. Free the allocated memory inside NODE
3831 and its children. */
3833 static reg_errcode_t
3834 free_tree (void *extra, bin_tree_t *node)
3836 free_token (&node->token);
3841 /* Duplicate the node SRC, and return new node. This is a preorder
3842 visit similar to the one implemented by the generic visitor, but
3843 we need more infrastructure to maintain two parallel trees --- so,
3844 it's easier to duplicate. */
3847 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3849 const bin_tree_t *node;
3850 bin_tree_t *dup_root;
3851 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3853 for (node = root; ; )
3855 /* Create a new tree and link it back to the current parent. */
3856 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3859 (*p_new)->parent = dup_node;
3860 (*p_new)->token.duplicated = 1;
3863 /* Go to the left node, or up and to the right. */
3867 p_new = &dup_node->left;
3871 const bin_tree_t *prev = NULL;
3872 while (node->right == prev || node->right == NULL)
3875 node = node->parent;
3876 dup_node = dup_node->parent;
3881 p_new = &dup_node->right;