1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
28 static void free_charset (re_charset_t *cset);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t *preg);
31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 static void optimize_utf8 (re_dfa_t *dfa);
35 static reg_errcode_t analyze (regex_t *preg);
36 static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
39 static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
50 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
51 unsigned int constraint);
52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56 static int fetch_number (re_string_t *input, re_token_t *token,
58 static int peek_token (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax) internal_function;
60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
82 re_token_t *token, int token_len,
86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
92 int *equiv_class_alloc,
93 const unsigned char *name);
94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
97 int *char_class_alloc,
98 const char *class_name,
100 #else /* not RE_ENABLE_I18N */
101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name);
103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
105 const char *class_name,
106 reg_syntax_t syntax);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans,
110 const char *class_name,
112 int non_match, reg_errcode_t *err);
113 static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right,
115 re_token_type_t type);
116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 const re_token_t *token);
119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120 static void free_token (re_token_t *node);
121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
124 /* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
129 const char __re_error_msgid[] attribute_hidden =
131 #define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
137 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
143 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
152 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
155 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
158 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
161 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
164 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
167 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
170 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
176 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183 const size_t __re_error_msgid_idx[] attribute_hidden =
204 /* Entry points for GNU code. */
209 /* For ZOS USS we must define btowc */
220 mbtowc (wtmp, tmp, 1);
225 /* re_compile_pattern is the GNU regular expression compiler: it
226 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
227 Returns 0 if the pattern was valid, otherwise an error string.
229 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
230 are set in BUFP on entry. */
233 re_compile_pattern (pattern, length, bufp)
236 struct re_pattern_buffer *bufp;
240 /* And GNU code determines whether or not to get register information
241 by passing null for the REGS argument to re_match, etc., not by
242 setting no_sub, unless RE_NO_SUB is set. */
243 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
245 /* Match anchors at newline. */
246 bufp->newline_anchor = 1;
248 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
252 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
255 weak_alias (__re_compile_pattern, re_compile_pattern)
258 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
259 also be assigned to arbitrarily: each pattern buffer stores its own
260 syntax, so it can be changed between regex compilations. */
261 /* This has no initializer because initialized variables in Emacs
262 become read-only after dumping. */
263 reg_syntax_t re_syntax_options;
266 /* Specify the precise syntax of regexps for compilation. This provides
267 for compatibility for various utilities which historically have
268 different, incompatible syntaxes.
270 The argument SYNTAX is a bit mask comprised of the various bits
271 defined in regex.h. We return the old syntax. */
274 re_set_syntax (syntax)
277 reg_syntax_t ret = re_syntax_options;
279 re_syntax_options = syntax;
283 weak_alias (__re_set_syntax, re_set_syntax)
287 re_compile_fastmap (bufp)
288 struct re_pattern_buffer *bufp;
290 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
291 char *fastmap = bufp->fastmap;
293 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
294 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
295 if (dfa->init_state != dfa->init_state_word)
296 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
297 if (dfa->init_state != dfa->init_state_nl)
298 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
299 if (dfa->init_state != dfa->init_state_begbuf)
300 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
301 bufp->fastmap_accurate = 1;
305 weak_alias (__re_compile_fastmap, re_compile_fastmap)
309 __attribute ((always_inline))
310 re_set_fastmap (char *fastmap, int icase, int ch)
314 fastmap[tolower (ch)] = 1;
317 /* Helper function for re_compile_fastmap.
318 Compile fastmap for the initial_state INIT_STATE. */
321 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
324 volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
326 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
327 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
329 int node = init_state->nodes.elems[node_cnt];
330 re_token_type_t type = dfa->nodes[node].type;
332 if (type == CHARACTER)
334 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
335 #ifdef RE_ENABLE_I18N
336 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
338 unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
343 *p++ = dfa->nodes[node].opr.c;
344 while (++node < dfa->nodes_len
345 && dfa->nodes[node].type == CHARACTER
346 && dfa->nodes[node].mb_partial)
347 *p++ = dfa->nodes[node].opr.c;
348 memset (&state, '\0', sizeof (state));
349 if (__mbrtowc (&wc, (const char *) buf, p - buf,
351 && (__wcrtomb ((char *) buf, towlower (wc), &state)
353 re_set_fastmap (fastmap, 0, buf[0]);
358 else if (type == SIMPLE_BRACKET)
361 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
364 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
365 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
366 if (w & ((bitset_word_t) 1 << j))
367 re_set_fastmap (fastmap, icase, ch);
370 #ifdef RE_ENABLE_I18N
371 else if (type == COMPLEX_BRACKET)
373 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
377 /* See if we have to try all bytes which start multiple collation
379 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
380 collation element, and don't catch 'b' since 'b' is
381 the only collation element which starts from 'b' (and
382 it is caught by SIMPLE_BRACKET). */
383 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
384 && (cset->ncoll_syms || cset->nranges))
386 const int32_t *table = (const int32_t *)
387 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
388 for (i = 0; i < SBC_MAX; ++i)
390 re_set_fastmap (fastmap, icase, i);
394 /* See if we have to start the match at all multibyte characters,
395 i.e. where we would not find an invalid sequence. This only
396 applies to multibyte character sets; for single byte character
397 sets, the SIMPLE_BRACKET again suffices. */
398 if (dfa->mb_cur_max > 1
399 && (cset->nchar_classes || cset->non_match || cset->nranges
401 || cset->nequiv_classes
409 memset (&mbs, 0, sizeof (mbs));
410 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
411 re_set_fastmap (fastmap, false, (int) c);
418 /* ... Else catch all bytes which can start the mbchars. */
419 for (i = 0; i < cset->nmbchars; ++i)
423 memset (&state, '\0', sizeof (state));
424 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
425 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
426 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
428 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
430 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
435 #endif /* RE_ENABLE_I18N */
436 else if (type == OP_PERIOD
437 #ifdef RE_ENABLE_I18N
438 || type == OP_UTF8_PERIOD
439 #endif /* RE_ENABLE_I18N */
440 || type == END_OF_RE)
442 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
443 if (type == END_OF_RE)
444 bufp->can_be_null = 1;
450 /* Entry point for POSIX code. */
451 /* regcomp takes a regular expression as a string and compiles it.
453 PREG is a regex_t *. We do not expect any fields to be initialized,
454 since POSIX says we shouldn't. Thus, we set
456 `buffer' to the compiled pattern;
457 `used' to the length of the compiled pattern;
458 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
459 REG_EXTENDED bit in CFLAGS is set; otherwise, to
460 RE_SYNTAX_POSIX_BASIC;
461 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
462 `fastmap' to an allocated space for the fastmap;
463 `fastmap_accurate' to zero;
464 `re_nsub' to the number of subexpressions in PATTERN.
466 PATTERN is the address of the pattern string.
468 CFLAGS is a series of bits which affect compilation.
470 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
471 use POSIX basic syntax.
473 If REG_NEWLINE is set, then . and [^...] don't match newline.
474 Also, regexec will try a match beginning after every newline.
476 If REG_ICASE is set, then we considers upper- and lowercase
477 versions of letters to be equivalent when matching.
479 If REG_NOSUB is set, then when PREG is passed to regexec, that
480 routine will report only success or failure, and nothing about the
483 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
484 the return codes and their meanings.) */
487 regcomp (preg, pattern, cflags)
488 regex_t *__restrict preg;
489 const char *__restrict pattern;
493 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
494 : RE_SYNTAX_POSIX_BASIC);
500 /* Try to allocate space for the fastmap. */
501 preg->fastmap = re_malloc (char, SBC_MAX);
502 if (BE (preg->fastmap == NULL, 0))
505 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
507 /* If REG_NEWLINE is set, newlines are treated differently. */
508 if (cflags & REG_NEWLINE)
509 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
510 syntax &= ~RE_DOT_NEWLINE;
511 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
512 /* It also changes the matching behavior. */
513 preg->newline_anchor = 1;
516 preg->newline_anchor = 0;
517 preg->no_sub = !!(cflags & REG_NOSUB);
518 preg->translate = NULL;
520 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
522 /* POSIX doesn't distinguish between an unmatched open-group and an
523 unmatched close-group: both are REG_EPAREN. */
524 if (ret == REG_ERPAREN)
527 /* We have already checked preg->fastmap != NULL. */
528 if (BE (ret == REG_NOERROR, 1))
529 /* Compute the fastmap now, since regexec cannot modify the pattern
530 buffer. This function never fails in this implementation. */
531 (void) re_compile_fastmap (preg);
534 /* Some error occurred while compiling the expression. */
535 re_free (preg->fastmap);
536 preg->fastmap = NULL;
542 weak_alias (__regcomp, regcomp)
545 /* Returns a message corresponding to an error code, ERRCODE, returned
546 from either regcomp or regexec. We don't use PREG here. */
549 regerror(int errcode, const regex_t *__restrict preg,
550 char *__restrict errbuf, size_t errbuf_size)
556 || errcode >= (int) (sizeof (__re_error_msgid_idx)
557 / sizeof (__re_error_msgid_idx[0])), 0))
558 /* Only error codes returned by the rest of the code should be passed
559 to this routine. If we are given anything else, or if other regex
560 code generates an invalid error code, then the program has a bug.
561 Dump core so we can fix it. */
564 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
566 msg_size = strlen (msg) + 1; /* Includes the null. */
568 if (BE (errbuf_size != 0, 1))
570 if (BE (msg_size > errbuf_size, 0))
572 memcpy (errbuf, msg, errbuf_size - 1);
573 errbuf[errbuf_size - 1] = 0;
576 memcpy (errbuf, msg, msg_size);
582 weak_alias (__regerror, regerror)
586 #ifdef RE_ENABLE_I18N
587 /* This static array is used for the map to single-byte characters when
588 UTF-8 is used. Otherwise we would allocate memory just to initialize
589 it the same all the time. UTF-8 is the preferred encoding so this is
590 a worthwhile optimization. */
592 static const bitset_t utf8_sb_map = {
593 /* Set the first 128 bits. */
594 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
596 #else /* ! (__GNUC__ >= 3) */
597 static bitset_t utf8_sb_map;
598 #endif /* __GNUC__ >= 3 */
599 #endif /* RE_ENABLE_I18N */
603 free_dfa_content (re_dfa_t *dfa)
608 for (i = 0; i < dfa->nodes_len; ++i)
609 free_token (dfa->nodes + i);
610 re_free (dfa->nexts);
611 for (i = 0; i < dfa->nodes_len; ++i)
613 if (dfa->eclosures != NULL)
614 re_node_set_free (dfa->eclosures + i);
615 if (dfa->inveclosures != NULL)
616 re_node_set_free (dfa->inveclosures + i);
617 if (dfa->edests != NULL)
618 re_node_set_free (dfa->edests + i);
620 re_free (dfa->edests);
621 re_free (dfa->eclosures);
622 re_free (dfa->inveclosures);
623 re_free (dfa->nodes);
625 if (dfa->state_table)
626 for (i = 0; i <= dfa->state_hash_mask; ++i)
628 struct re_state_table_entry *entry = dfa->state_table + i;
629 for (j = 0; j < entry->num; ++j)
631 re_dfastate_t *state = entry->array[j];
634 re_free (entry->array);
636 re_free (dfa->state_table);
637 #ifdef RE_ENABLE_I18N
638 if (dfa->sb_char != utf8_sb_map)
639 re_free (dfa->sb_char);
641 re_free (dfa->subexp_map);
643 re_free (dfa->re_str);
650 /* Free dynamically allocated space used by PREG. */
656 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
657 if (BE (dfa != NULL, 1))
658 free_dfa_content (dfa);
662 re_free (preg->fastmap);
663 preg->fastmap = NULL;
665 re_free (preg->translate);
666 preg->translate = NULL;
669 weak_alias (__regfree, regfree)
672 /* Entry points compatible with 4.2 BSD regex library. We don't define
673 them unless specifically requested. */
675 #if defined _REGEX_RE_COMP || defined _LIBC
677 /* BSD has one and only one pattern buffer. */
678 static struct re_pattern_buffer re_comp_buf;
682 /* Make these definitions weak in libc, so POSIX programs can redefine
683 these names if they don't use our functions, and still use
684 regcomp/regexec above without link errors. */
695 if (!re_comp_buf.buffer)
696 return gettext ("No previous regular expression");
700 if (re_comp_buf.buffer)
702 fastmap = re_comp_buf.fastmap;
703 re_comp_buf.fastmap = NULL;
704 __regfree (&re_comp_buf);
705 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
706 re_comp_buf.fastmap = fastmap;
709 if (re_comp_buf.fastmap == NULL)
711 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
712 if (re_comp_buf.fastmap == NULL)
713 return (char *) gettext (__re_error_msgid
714 + __re_error_msgid_idx[(int) REG_ESPACE]);
717 /* Since `re_exec' always passes NULL for the `regs' argument, we
718 don't need to initialize the pattern buffer fields which affect it. */
720 /* Match anchors at newlines. */
721 re_comp_buf.newline_anchor = 1;
723 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
728 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
729 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
733 libc_freeres_fn (free_mem)
735 __regfree (&re_comp_buf);
739 #endif /* _REGEX_RE_COMP */
741 /* Internal entry point.
742 Compile the regular expression PATTERN, whose length is LENGTH.
743 SYNTAX indicate regular expression's syntax. */
746 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
749 reg_errcode_t err = REG_NOERROR;
753 /* Initialize the pattern buffer. */
754 preg->fastmap_accurate = 0;
755 preg->syntax = syntax;
756 preg->not_bol = preg->not_eol = 0;
759 preg->can_be_null = 0;
760 preg->regs_allocated = REGS_UNALLOCATED;
762 /* Initialize the dfa. */
763 dfa = (re_dfa_t *) preg->buffer;
764 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
766 /* If zero allocated, but buffer is non-null, try to realloc
767 enough space. This loses if buffer's address is bogus, but
768 that is the user's responsibility. If ->buffer is NULL this
769 is a simple allocation. */
770 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
773 preg->allocated = sizeof (re_dfa_t);
774 preg->buffer = (unsigned char *) dfa;
776 preg->used = sizeof (re_dfa_t);
778 err = init_dfa (dfa, length);
779 if (BE (err != REG_NOERROR, 0))
781 free_dfa_content (dfa);
787 /* Note: length+1 will not overflow since it is checked in init_dfa. */
788 dfa->re_str = re_malloc (char, length + 1);
789 strncpy (dfa->re_str, pattern, length + 1);
792 __libc_lock_init (dfa->lock);
794 err = re_string_construct (®exp, pattern, length, preg->translate,
795 syntax & RE_ICASE, dfa);
796 if (BE (err != REG_NOERROR, 0))
798 re_compile_internal_free_return:
799 free_workarea_compile (preg);
800 re_string_destruct (®exp);
801 free_dfa_content (dfa);
807 /* Parse the regular expression, and build a structure tree. */
809 dfa->str_tree = parse (®exp, preg, syntax, &err);
810 if (BE (dfa->str_tree == NULL, 0))
811 goto re_compile_internal_free_return;
813 /* Analyze the tree and create the nfa. */
814 err = analyze (preg);
815 if (BE (err != REG_NOERROR, 0))
816 goto re_compile_internal_free_return;
818 #ifdef RE_ENABLE_I18N
819 /* If possible, do searching in single byte encoding to speed things up. */
820 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
824 /* Then create the initial state of the dfa. */
825 err = create_initial_state (dfa);
827 /* Release work areas. */
828 free_workarea_compile (preg);
829 re_string_destruct (®exp);
831 if (BE (err != REG_NOERROR, 0))
833 free_dfa_content (dfa);
841 /* Initialize DFA. We use the length of the regular expression PAT_LEN
842 as the initial length of some arrays. */
845 init_dfa (re_dfa_t *dfa, size_t pat_len)
847 unsigned int table_size;
852 memset (dfa, '\0', sizeof (re_dfa_t));
854 /* Force allocation of str_tree_storage the first time. */
855 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
857 /* Avoid overflows. */
858 if (pat_len == SIZE_MAX)
861 dfa->nodes_alloc = pat_len + 1;
862 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
864 /* table_size = 2 ^ ceil(log pat_len) */
865 for (table_size = 1; ; table_size <<= 1)
866 if (table_size > pat_len)
869 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
870 dfa->state_hash_mask = table_size - 1;
872 dfa->mb_cur_max = MB_CUR_MAX;
874 if (dfa->mb_cur_max == 6
875 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
877 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
880 # ifdef HAVE_LANGINFO_CODESET
881 codeset_name = nl_langinfo (CODESET);
883 codeset_name = getenv ("LC_ALL");
884 if (codeset_name == NULL || codeset_name[0] == '\0')
885 codeset_name = getenv ("LC_CTYPE");
886 if (codeset_name == NULL || codeset_name[0] == '\0')
887 codeset_name = getenv ("LANG");
888 if (codeset_name == NULL)
890 else if (strchr (codeset_name, '.') != NULL)
891 codeset_name = strchr (codeset_name, '.') + 1;
894 /* strcasecmp isn't a standard interface. brute force check */
896 if (strcasecmp (codeset_name, "UTF-8") == 0
897 || strcasecmp (codeset_name, "UTF8") == 0)
900 if ( (codeset_name[0] == 'U' || codeset_name[0] == 'u')
901 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
902 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
903 && (codeset_name[3] == '-'
904 ? codeset_name[4] == '8' && codeset_name[5] == '\0'
905 : codeset_name[3] == '8' && codeset_name[4] == '\0'))
909 /* We check exhaustively in the loop below if this charset is a
910 superset of ASCII. */
911 dfa->map_notascii = 0;
914 #ifdef RE_ENABLE_I18N
915 if (dfa->mb_cur_max > 1)
919 #if !defined(__GNUC__) || __GNUC__ < 3
920 static short utf8_sb_map_inited = 0;
922 if (! utf8_sb_map_inited)
926 utf8_sb_map_inited = 0;
927 for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
928 utf8_sb_map[i] = BITSET_WORD_MAX;
931 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
937 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
938 if (BE (dfa->sb_char == NULL, 0))
941 /* Set the bits corresponding to single byte chars. */
942 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
943 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
945 wint_t wch = __btowc (ch);
947 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
949 if (isascii (ch) && wch != ch)
950 dfa->map_notascii = 1;
957 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
962 /* Initialize WORD_CHAR table, which indicate which character is
963 "word". In this case "word" means that it is the word construction
964 character used by some operators like "\<", "\>", etc. */
968 init_word_char (re_dfa_t *dfa)
971 dfa->word_ops_used = 1;
972 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
973 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
974 if (isalnum (ch) || ch == '_')
975 dfa->word_char[i] |= (bitset_word_t) 1 << j;
978 /* Free the work area which are only used while compiling. */
981 free_workarea_compile (regex_t *preg)
983 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
984 bin_tree_storage_t *storage, *next;
985 for (storage = dfa->str_tree_storage; storage; storage = next)
987 next = storage->next;
990 dfa->str_tree_storage = NULL;
991 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
992 dfa->str_tree = NULL;
993 re_free (dfa->org_indices);
994 dfa->org_indices = NULL;
997 /* Create initial states for all contexts. */
1000 create_initial_state (re_dfa_t *dfa)
1004 re_node_set init_nodes;
1006 /* Initial states have the epsilon closure of the node which is
1007 the first node of the regular expression. */
1008 first = dfa->str_tree->first->node_idx;
1009 dfa->init_node = first;
1010 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1011 if (BE (err != REG_NOERROR, 0))
1014 /* The back-references which are in initial states can epsilon transit,
1015 since in this case all of the subexpressions can be null.
1016 Then we add epsilon closures of the nodes which are the next nodes of
1017 the back-references. */
1018 if (dfa->nbackref > 0)
1019 for (i = 0; i < init_nodes.nelem; ++i)
1021 int node_idx = init_nodes.elems[i];
1022 re_token_type_t type = dfa->nodes[node_idx].type;
1025 if (type != OP_BACK_REF)
1027 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1029 re_token_t *clexp_node;
1030 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1031 if (clexp_node->type == OP_CLOSE_SUBEXP
1032 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1035 if (clexp_idx == init_nodes.nelem)
1038 if (type == OP_BACK_REF)
1040 int dest_idx = dfa->edests[node_idx].elems[0];
1041 if (!re_node_set_contains (&init_nodes, dest_idx))
1043 reg_errcode_t err = re_node_set_merge (&init_nodes,
1046 if (err != REG_NOERROR)
1053 /* It must be the first time to invoke acquire_state. */
1054 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1055 /* We don't check ERR here, since the initial state must not be NULL. */
1056 if (BE (dfa->init_state == NULL, 0))
1058 if (dfa->init_state->has_constraint)
1060 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1062 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1064 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1068 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1069 || dfa->init_state_begbuf == NULL, 0))
1073 dfa->init_state_word = dfa->init_state_nl
1074 = dfa->init_state_begbuf = dfa->init_state;
1076 re_node_set_free (&init_nodes);
1080 #ifdef RE_ENABLE_I18N
1081 /* If it is possible to do searching in single byte encoding instead of UTF-8
1082 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1083 DFA nodes where needed. */
1086 optimize_utf8 (re_dfa_t *dfa)
1088 int node, i, mb_chars = 0, has_period = 0;
1090 for (node = 0; node < dfa->nodes_len; ++node)
1091 switch (dfa->nodes[node].type)
1094 if (dfa->nodes[node].opr.c >= 0x80)
1098 switch (dfa->nodes[node].opr.ctx_type)
1106 /* Word anchors etc. cannot be handled. It's okay to test
1107 opr.ctx_type since constraints (for all DFA nodes) are
1108 created by ORing one or more opr.ctx_type values. */
1118 case OP_DUP_ASTERISK:
1119 case OP_OPEN_SUBEXP:
1120 case OP_CLOSE_SUBEXP:
1122 case COMPLEX_BRACKET:
1124 case SIMPLE_BRACKET:
1125 /* Just double check. The non-ASCII range starts at 0x80. */
1126 assert (0x80 % BITSET_WORD_BITS == 0);
1127 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1128 if (dfa->nodes[node].opr.sbcset[i])
1135 if (mb_chars || has_period)
1136 for (node = 0; node < dfa->nodes_len; ++node)
1138 if (dfa->nodes[node].type == CHARACTER
1139 && dfa->nodes[node].opr.c >= 0x80)
1140 dfa->nodes[node].mb_partial = 0;
1141 else if (dfa->nodes[node].type == OP_PERIOD)
1142 dfa->nodes[node].type = OP_UTF8_PERIOD;
1145 /* The search can be in single byte locale. */
1146 dfa->mb_cur_max = 1;
1148 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1152 /* Analyze the structure tree, and calculate "first", "next", "edest",
1153 "eclosure", and "inveclosure". */
1155 static reg_errcode_t
1156 analyze (regex_t *preg)
1158 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1161 /* Allocate arrays. */
1162 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1163 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1164 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1165 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1166 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1167 || dfa->eclosures == NULL, 0))
1170 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1171 if (dfa->subexp_map != NULL)
1174 for (i = 0; i < preg->re_nsub; i++)
1175 dfa->subexp_map[i] = i;
1176 preorder (dfa->str_tree, optimize_subexps, dfa);
1177 for (i = 0; i < preg->re_nsub; i++)
1178 if (dfa->subexp_map[i] != i)
1180 if (i == preg->re_nsub)
1182 free (dfa->subexp_map);
1183 dfa->subexp_map = NULL;
1187 ret = postorder (dfa->str_tree, lower_subexps, preg);
1188 if (BE (ret != REG_NOERROR, 0))
1190 ret = postorder (dfa->str_tree, calc_first, dfa);
1191 if (BE (ret != REG_NOERROR, 0))
1193 preorder (dfa->str_tree, calc_next, dfa);
1194 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1195 if (BE (ret != REG_NOERROR, 0))
1197 ret = calc_eclosure (dfa);
1198 if (BE (ret != REG_NOERROR, 0))
1201 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1202 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1203 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1206 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1207 if (BE (dfa->inveclosures == NULL, 0))
1209 ret = calc_inveclosure (dfa);
1215 /* Our parse trees are very unbalanced, so we cannot use a stack to
1216 implement parse tree visits. Instead, we use parent pointers and
1217 some hairy code in these two functions. */
1218 static reg_errcode_t
1219 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1222 bin_tree_t *node, *prev;
1224 for (node = root; ; )
1226 /* Descend down the tree, preferably to the left (or to the right
1227 if that's the only child). */
1228 while (node->left || node->right)
1236 reg_errcode_t err = fn (extra, node);
1237 if (BE (err != REG_NOERROR, 0))
1239 if (node->parent == NULL)
1242 node = node->parent;
1244 /* Go up while we have a node that is reached from the right. */
1245 while (node->right == prev || node->right == NULL);
1250 static reg_errcode_t
1251 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1256 for (node = root; ; )
1258 reg_errcode_t err = fn (extra, node);
1259 if (BE (err != REG_NOERROR, 0))
1262 /* Go to the left node, or up and to the right. */
1267 bin_tree_t *prev = NULL;
1268 while (node->right == prev || node->right == NULL)
1271 node = node->parent;
1280 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1281 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1282 backreferences as well. Requires a preorder visit. */
1283 static reg_errcode_t
1284 optimize_subexps (void *extra, bin_tree_t *node)
1286 re_dfa_t *dfa = (re_dfa_t *) extra;
1288 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1290 int idx = node->token.opr.idx;
1291 node->token.opr.idx = dfa->subexp_map[idx];
1292 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1295 else if (node->token.type == SUBEXP
1296 && node->left && node->left->token.type == SUBEXP)
1298 int other_idx = node->left->token.opr.idx;
1300 node->left = node->left->left;
1302 node->left->parent = node;
1304 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1305 if (other_idx < BITSET_WORD_BITS)
1306 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1312 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1313 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1314 static reg_errcode_t
1315 lower_subexps (void *extra, bin_tree_t *node)
1317 regex_t *preg = (regex_t *) extra;
1318 reg_errcode_t err = REG_NOERROR;
1320 if (node->left && node->left->token.type == SUBEXP)
1322 node->left = lower_subexp (&err, preg, node->left);
1324 node->left->parent = node;
1326 if (node->right && node->right->token.type == SUBEXP)
1328 node->right = lower_subexp (&err, preg, node->right);
1330 node->right->parent = node;
1337 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1339 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1340 bin_tree_t *body = node->left;
1341 bin_tree_t *op, *cls, *tree1, *tree;
1344 /* We do not optimize empty subexpressions, because otherwise we may
1345 have bad CONCAT nodes with NULL children. This is obviously not
1346 very common, so we do not lose much. An example that triggers
1347 this case is the sed "script" /\(\)/x. */
1348 && node->left != NULL
1349 && (node->token.opr.idx >= BITSET_WORD_BITS
1350 || !(dfa->used_bkref_map
1351 & ((bitset_word_t) 1 << node->token.opr.idx))))
1354 /* Convert the SUBEXP node to the concatenation of an
1355 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1356 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1357 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1358 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1359 tree = create_tree (dfa, op, tree1, CONCAT);
1360 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1366 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1367 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1371 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1372 nodes. Requires a postorder visit. */
1373 static reg_errcode_t
1374 calc_first (void *extra, bin_tree_t *node)
1376 re_dfa_t *dfa = (re_dfa_t *) extra;
1377 if (node->token.type == CONCAT)
1379 node->first = node->left->first;
1380 node->node_idx = node->left->node_idx;
1385 node->node_idx = re_dfa_add_node (dfa, node->token);
1386 if (BE (node->node_idx == -1, 0))
1388 if (node->token.type == ANCHOR)
1389 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1394 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1395 static reg_errcode_t
1396 calc_next (void *extra, bin_tree_t *node)
1398 switch (node->token.type)
1400 case OP_DUP_ASTERISK:
1401 node->left->next = node;
1404 node->left->next = node->right->first;
1405 node->right->next = node->next;
1409 node->left->next = node->next;
1411 node->right->next = node->next;
1417 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1418 static reg_errcode_t
1419 link_nfa_nodes (void *extra, bin_tree_t *node)
1421 re_dfa_t *dfa = (re_dfa_t *) extra;
1422 int idx = node->node_idx;
1423 reg_errcode_t err = REG_NOERROR;
1425 switch (node->token.type)
1431 assert (node->next == NULL);
1434 case OP_DUP_ASTERISK:
1438 dfa->has_plural_match = 1;
1439 if (node->left != NULL)
1440 left = node->left->first->node_idx;
1442 left = node->next->node_idx;
1443 if (node->right != NULL)
1444 right = node->right->first->node_idx;
1446 right = node->next->node_idx;
1448 assert (right > -1);
1449 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1454 case OP_OPEN_SUBEXP:
1455 case OP_CLOSE_SUBEXP:
1456 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1460 dfa->nexts[idx] = node->next->node_idx;
1461 if (node->token.type == OP_BACK_REF)
1462 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1466 assert (!IS_EPSILON_NODE (node->token.type));
1467 dfa->nexts[idx] = node->next->node_idx;
1474 /* Duplicate the epsilon closure of the node ROOT_NODE.
1475 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1476 to their own constraint. */
1478 static reg_errcode_t
1480 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1481 int root_node, unsigned int init_constraint)
1483 int org_node, clone_node, ret;
1484 unsigned int constraint = init_constraint;
1485 for (org_node = top_org_node, clone_node = top_clone_node;;)
1487 int org_dest, clone_dest;
1488 if (dfa->nodes[org_node].type == OP_BACK_REF)
1490 /* If the back reference epsilon-transit, its destination must
1491 also have the constraint. Then duplicate the epsilon closure
1492 of the destination of the back reference, and store it in
1493 edests of the back reference. */
1494 org_dest = dfa->nexts[org_node];
1495 re_node_set_empty (dfa->edests + clone_node);
1496 clone_dest = duplicate_node (dfa, org_dest, constraint);
1497 if (BE (clone_dest == -1, 0))
1499 dfa->nexts[clone_node] = dfa->nexts[org_node];
1500 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1501 if (BE (ret < 0, 0))
1504 else if (dfa->edests[org_node].nelem == 0)
1506 /* In case of the node can't epsilon-transit, don't duplicate the
1507 destination and store the original destination as the
1508 destination of the node. */
1509 dfa->nexts[clone_node] = dfa->nexts[org_node];
1512 else if (dfa->edests[org_node].nelem == 1)
1514 /* In case of the node can epsilon-transit, and it has only one
1516 org_dest = dfa->edests[org_node].elems[0];
1517 re_node_set_empty (dfa->edests + clone_node);
1518 /* If the node is root_node itself, it means the epsilon clsoure
1519 has a loop. Then tie it to the destination of the root_node. */
1520 if (org_node == root_node && clone_node != org_node)
1522 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1523 if (BE (ret < 0, 0))
1527 /* In case of the node has another constraint, add it. */
1528 constraint |= dfa->nodes[org_node].constraint;
1529 clone_dest = duplicate_node (dfa, org_dest, constraint);
1530 if (BE (clone_dest == -1, 0))
1532 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1533 if (BE (ret < 0, 0))
1536 else /* dfa->edests[org_node].nelem == 2 */
1538 /* In case of the node can epsilon-transit, and it has two
1539 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1540 org_dest = dfa->edests[org_node].elems[0];
1541 re_node_set_empty (dfa->edests + clone_node);
1542 /* Search for a duplicated node which satisfies the constraint. */
1543 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1544 if (clone_dest == -1)
1546 /* There is no such duplicated node, create a new one. */
1548 clone_dest = duplicate_node (dfa, org_dest, constraint);
1549 if (BE (clone_dest == -1, 0))
1551 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1552 if (BE (ret < 0, 0))
1554 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1555 root_node, constraint);
1556 if (BE (err != REG_NOERROR, 0))
1561 /* There is a duplicated node which satisfies the constraint,
1562 use it to avoid infinite loop. */
1563 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1564 if (BE (ret < 0, 0))
1568 org_dest = dfa->edests[org_node].elems[1];
1569 clone_dest = duplicate_node (dfa, org_dest, constraint);
1570 if (BE (clone_dest == -1, 0))
1572 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1573 if (BE (ret < 0, 0))
1576 org_node = org_dest;
1577 clone_node = clone_dest;
1582 /* Search for a node which is duplicated from the node ORG_NODE, and
1583 satisfies the constraint CONSTRAINT. */
1586 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1587 unsigned int constraint)
1590 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1592 if (org_node == dfa->org_indices[idx]
1593 && constraint == dfa->nodes[idx].constraint)
1594 return idx; /* Found. */
1596 return -1; /* Not found. */
1599 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1600 Return the index of the new node, or -1 if insufficient storage is
1604 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1606 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1607 if (BE (dup_idx != -1, 1))
1609 dfa->nodes[dup_idx].constraint = constraint;
1610 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1611 dfa->nodes[dup_idx].duplicated = 1;
1613 /* Store the index of the original node. */
1614 dfa->org_indices[dup_idx] = org_idx;
1619 static reg_errcode_t
1620 calc_inveclosure (re_dfa_t *dfa)
1623 for (idx = 0; idx < dfa->nodes_len; ++idx)
1624 re_node_set_init_empty (dfa->inveclosures + idx);
1626 for (src = 0; src < dfa->nodes_len; ++src)
1628 int *elems = dfa->eclosures[src].elems;
1629 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1631 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1632 if (BE (ret == -1, 0))
1640 /* Calculate "eclosure" for all the node in DFA. */
1642 static reg_errcode_t
1643 calc_eclosure (re_dfa_t *dfa)
1645 int node_idx, incomplete;
1647 assert (dfa->nodes_len > 0);
1650 /* For each nodes, calculate epsilon closure. */
1651 for (node_idx = 0; ; ++node_idx)
1654 re_node_set eclosure_elem;
1655 if (node_idx == dfa->nodes_len)
1664 assert (dfa->eclosures[node_idx].nelem != -1);
1667 /* If we have already calculated, skip it. */
1668 if (dfa->eclosures[node_idx].nelem != 0)
1670 /* Calculate epsilon closure of `node_idx'. */
1671 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1672 if (BE (err != REG_NOERROR, 0))
1675 if (dfa->eclosures[node_idx].nelem == 0)
1678 re_node_set_free (&eclosure_elem);
1684 /* Calculate epsilon closure of NODE. */
1686 static reg_errcode_t
1687 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1691 re_node_set eclosure;
1694 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1695 if (BE (err != REG_NOERROR, 0))
1698 /* This indicates that we are calculating this node now.
1699 We reference this value to avoid infinite loop. */
1700 dfa->eclosures[node].nelem = -1;
1702 /* If the current node has constraints, duplicate all nodes
1703 since they must inherit the constraints. */
1704 if (dfa->nodes[node].constraint
1705 && dfa->edests[node].nelem
1706 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1708 err = duplicate_node_closure (dfa, node, node, node,
1709 dfa->nodes[node].constraint);
1710 if (BE (err != REG_NOERROR, 0))
1714 /* Expand each epsilon destination nodes. */
1715 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1716 for (i = 0; i < dfa->edests[node].nelem; ++i)
1718 re_node_set eclosure_elem;
1719 int edest = dfa->edests[node].elems[i];
1720 /* If calculating the epsilon closure of `edest' is in progress,
1721 return intermediate result. */
1722 if (dfa->eclosures[edest].nelem == -1)
1727 /* If we haven't calculated the epsilon closure of `edest' yet,
1728 calculate now. Otherwise use calculated epsilon closure. */
1729 if (dfa->eclosures[edest].nelem == 0)
1731 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1732 if (BE (err != REG_NOERROR, 0))
1736 eclosure_elem = dfa->eclosures[edest];
1737 /* Merge the epsilon closure of `edest'. */
1738 err = re_node_set_merge (&eclosure, &eclosure_elem);
1739 if (BE (err != REG_NOERROR, 0))
1741 /* If the epsilon closure of `edest' is incomplete,
1742 the epsilon closure of this node is also incomplete. */
1743 if (dfa->eclosures[edest].nelem == 0)
1746 re_node_set_free (&eclosure_elem);
1750 /* An epsilon closure includes itself. */
1751 ret = re_node_set_insert (&eclosure, node);
1752 if (BE (ret < 0, 0))
1754 if (incomplete && !root)
1755 dfa->eclosures[node].nelem = 0;
1757 dfa->eclosures[node] = eclosure;
1758 *new_set = eclosure;
1762 /* Functions for token which are used in the parser. */
1764 /* Fetch a token from INPUT.
1765 We must not use this function inside bracket expressions. */
1769 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1771 re_string_skip_bytes (input, peek_token (result, input, syntax));
1774 /* Peek a token from INPUT, and return the length of the token.
1775 We must not use this function inside bracket expressions. */
1779 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1783 if (re_string_eoi (input))
1785 token->type = END_OF_RE;
1789 c = re_string_peek_byte (input, 0);
1792 token->word_char = 0;
1793 #ifdef RE_ENABLE_I18N
1794 token->mb_partial = 0;
1795 if (input->mb_cur_max > 1 &&
1796 !re_string_first_byte (input, re_string_cur_idx (input)))
1798 token->type = CHARACTER;
1799 token->mb_partial = 1;
1806 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1808 token->type = BACK_SLASH;
1812 c2 = re_string_peek_byte_case (input, 1);
1814 token->type = CHARACTER;
1815 #ifdef RE_ENABLE_I18N
1816 if (input->mb_cur_max > 1)
1818 wint_t wc = re_string_wchar_at (input,
1819 re_string_cur_idx (input) + 1);
1820 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1824 token->word_char = IS_WORD_CHAR (c2) != 0;
1829 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1830 token->type = OP_ALT;
1832 case '1': case '2': case '3': case '4': case '5':
1833 case '6': case '7': case '8': case '9':
1834 if (!(syntax & RE_NO_BK_REFS))
1836 token->type = OP_BACK_REF;
1837 token->opr.idx = c2 - '1';
1841 if (!(syntax & RE_NO_GNU_OPS))
1843 token->type = ANCHOR;
1844 token->opr.ctx_type = WORD_FIRST;
1848 if (!(syntax & RE_NO_GNU_OPS))
1850 token->type = ANCHOR;
1851 token->opr.ctx_type = WORD_LAST;
1855 if (!(syntax & RE_NO_GNU_OPS))
1857 token->type = ANCHOR;
1858 token->opr.ctx_type = WORD_DELIM;
1862 if (!(syntax & RE_NO_GNU_OPS))
1864 token->type = ANCHOR;
1865 token->opr.ctx_type = NOT_WORD_DELIM;
1869 if (!(syntax & RE_NO_GNU_OPS))
1870 token->type = OP_WORD;
1873 if (!(syntax & RE_NO_GNU_OPS))
1874 token->type = OP_NOTWORD;
1877 if (!(syntax & RE_NO_GNU_OPS))
1878 token->type = OP_SPACE;
1881 if (!(syntax & RE_NO_GNU_OPS))
1882 token->type = OP_NOTSPACE;
1885 if (!(syntax & RE_NO_GNU_OPS))
1887 token->type = ANCHOR;
1888 token->opr.ctx_type = BUF_FIRST;
1892 if (!(syntax & RE_NO_GNU_OPS))
1894 token->type = ANCHOR;
1895 token->opr.ctx_type = BUF_LAST;
1899 if (!(syntax & RE_NO_BK_PARENS))
1900 token->type = OP_OPEN_SUBEXP;
1903 if (!(syntax & RE_NO_BK_PARENS))
1904 token->type = OP_CLOSE_SUBEXP;
1907 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1908 token->type = OP_DUP_PLUS;
1911 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1912 token->type = OP_DUP_QUESTION;
1915 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1916 token->type = OP_OPEN_DUP_NUM;
1919 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1920 token->type = OP_CLOSE_DUP_NUM;
1928 token->type = CHARACTER;
1929 #ifdef RE_ENABLE_I18N
1930 if (input->mb_cur_max > 1)
1932 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1933 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1937 token->word_char = IS_WORD_CHAR (token->opr.c);
1942 if (syntax & RE_NEWLINE_ALT)
1943 token->type = OP_ALT;
1946 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1947 token->type = OP_ALT;
1950 token->type = OP_DUP_ASTERISK;
1953 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1954 token->type = OP_DUP_PLUS;
1957 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1958 token->type = OP_DUP_QUESTION;
1961 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1962 token->type = OP_OPEN_DUP_NUM;
1965 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1966 token->type = OP_CLOSE_DUP_NUM;
1969 if (syntax & RE_NO_BK_PARENS)
1970 token->type = OP_OPEN_SUBEXP;
1973 if (syntax & RE_NO_BK_PARENS)
1974 token->type = OP_CLOSE_SUBEXP;
1977 token->type = OP_OPEN_BRACKET;
1980 token->type = OP_PERIOD;
1983 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1984 re_string_cur_idx (input) != 0)
1986 char prev = re_string_peek_byte (input, -1);
1987 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1990 token->type = ANCHOR;
1991 token->opr.ctx_type = LINE_FIRST;
1994 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1995 re_string_cur_idx (input) + 1 != re_string_length (input))
1998 re_string_skip_bytes (input, 1);
1999 peek_token (&next, input, syntax);
2000 re_string_skip_bytes (input, -1);
2001 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2004 token->type = ANCHOR;
2005 token->opr.ctx_type = LINE_LAST;
2013 /* Peek a token from INPUT, and return the length of the token.
2014 We must not use this function out of bracket expressions. */
2018 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2021 if (re_string_eoi (input))
2023 token->type = END_OF_RE;
2026 c = re_string_peek_byte (input, 0);
2029 #ifdef RE_ENABLE_I18N
2030 if (input->mb_cur_max > 1 &&
2031 !re_string_first_byte (input, re_string_cur_idx (input)))
2033 token->type = CHARACTER;
2036 #endif /* RE_ENABLE_I18N */
2038 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2039 && re_string_cur_idx (input) + 1 < re_string_length (input))
2041 /* In this case, '\' escape a character. */
2043 re_string_skip_bytes (input, 1);
2044 c2 = re_string_peek_byte (input, 0);
2046 token->type = CHARACTER;
2049 if (c == '[') /* '[' is a special char in a bracket exps. */
2053 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2054 c2 = re_string_peek_byte (input, 1);
2062 token->type = OP_OPEN_COLL_ELEM;
2065 token->type = OP_OPEN_EQUIV_CLASS;
2068 if (syntax & RE_CHAR_CLASSES)
2070 token->type = OP_OPEN_CHAR_CLASS;
2073 /* else fall through. */
2075 token->type = CHARACTER;
2085 token->type = OP_CHARSET_RANGE;
2088 token->type = OP_CLOSE_BRACKET;
2091 token->type = OP_NON_MATCH_LIST;
2094 token->type = CHARACTER;
2099 /* Functions for parser. */
2101 /* Entry point of the parser.
2102 Parse the regular expression REGEXP and return the structure tree.
2103 If an error is occured, ERR is set by error code, and return NULL.
2104 This function build the following tree, from regular expression <reg_exp>:
2110 CAT means concatenation.
2111 EOR means end of regular expression. */
2114 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2117 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2118 bin_tree_t *tree, *eor, *root;
2119 re_token_t current_token;
2120 dfa->syntax = syntax;
2121 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2122 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2123 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2125 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2127 root = create_tree (dfa, tree, eor, CONCAT);
2130 if (BE (eor == NULL || root == NULL, 0))
2138 /* This function build the following tree, from regular expression
2139 <branch1>|<branch2>:
2145 ALT means alternative, which represents the operator `|'. */
2148 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2149 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2151 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2152 bin_tree_t *tree, *branch = NULL;
2153 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2154 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2157 while (token->type == OP_ALT)
2159 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2160 if (token->type != OP_ALT && token->type != END_OF_RE
2161 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2163 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2164 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2169 tree = create_tree (dfa, tree, branch, OP_ALT);
2170 if (BE (tree == NULL, 0))
2179 /* This function build the following tree, from regular expression
2186 CAT means concatenation. */
2189 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2190 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2192 bin_tree_t *tree, *exp;
2193 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2194 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2195 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2198 while (token->type != OP_ALT && token->type != END_OF_RE
2199 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2201 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2202 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2206 if (tree != NULL && exp != NULL)
2208 tree = create_tree (dfa, tree, exp, CONCAT);
2215 else if (tree == NULL)
2217 /* Otherwise exp == NULL, we don't need to create new tree. */
2222 /* This function build the following tree, from regular expression a*:
2229 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2230 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2232 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2234 switch (token->type)
2237 tree = create_token_tree (dfa, NULL, NULL, token);
2238 if (BE (tree == NULL, 0))
2243 #ifdef RE_ENABLE_I18N
2244 if (dfa->mb_cur_max > 1)
2246 while (!re_string_eoi (regexp)
2247 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2249 bin_tree_t *mbc_remain;
2250 fetch_token (token, regexp, syntax);
2251 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2252 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2253 if (BE (mbc_remain == NULL || tree == NULL, 0))
2262 case OP_OPEN_SUBEXP:
2263 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2264 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2267 case OP_OPEN_BRACKET:
2268 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2269 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2273 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2278 dfa->used_bkref_map |= 1 << token->opr.idx;
2279 tree = create_token_tree (dfa, NULL, NULL, token);
2280 if (BE (tree == NULL, 0))
2286 dfa->has_mb_node = 1;
2288 case OP_OPEN_DUP_NUM:
2289 if (syntax & RE_CONTEXT_INVALID_DUP)
2295 case OP_DUP_ASTERISK:
2297 case OP_DUP_QUESTION:
2298 if (syntax & RE_CONTEXT_INVALID_OPS)
2303 else if (syntax & RE_CONTEXT_INDEP_OPS)
2305 fetch_token (token, regexp, syntax);
2306 return parse_expression (regexp, preg, token, syntax, nest, err);
2308 /* else fall through */
2309 case OP_CLOSE_SUBEXP:
2310 if ((token->type == OP_CLOSE_SUBEXP) &&
2311 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2316 /* else fall through */
2317 case OP_CLOSE_DUP_NUM:
2318 /* We treat it as a normal character. */
2320 /* Then we can these characters as normal characters. */
2321 token->type = CHARACTER;
2322 /* mb_partial and word_char bits should be initialized already
2324 tree = create_token_tree (dfa, NULL, NULL, token);
2325 if (BE (tree == NULL, 0))
2332 if ((token->opr.ctx_type
2333 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2334 && dfa->word_ops_used == 0)
2335 init_word_char (dfa);
2336 if (token->opr.ctx_type == WORD_DELIM
2337 || token->opr.ctx_type == NOT_WORD_DELIM)
2339 bin_tree_t *tree_first, *tree_last;
2340 if (token->opr.ctx_type == WORD_DELIM)
2342 token->opr.ctx_type = WORD_FIRST;
2343 tree_first = create_token_tree (dfa, NULL, NULL, token);
2344 token->opr.ctx_type = WORD_LAST;
2348 token->opr.ctx_type = INSIDE_WORD;
2349 tree_first = create_token_tree (dfa, NULL, NULL, token);
2350 token->opr.ctx_type = INSIDE_NOTWORD;
2352 tree_last = create_token_tree (dfa, NULL, NULL, token);
2353 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2354 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2362 tree = create_token_tree (dfa, NULL, NULL, token);
2363 if (BE (tree == NULL, 0))
2369 /* We must return here, since ANCHORs can't be followed
2370 by repetition operators.
2371 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2372 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2373 fetch_token (token, regexp, syntax);
2376 tree = create_token_tree (dfa, NULL, NULL, token);
2377 if (BE (tree == NULL, 0))
2382 if (dfa->mb_cur_max > 1)
2383 dfa->has_mb_node = 1;
2387 tree = build_charclass_op (dfa, regexp->trans,
2390 token->type == OP_NOTWORD, err);
2391 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2396 tree = build_charclass_op (dfa, regexp->trans,
2399 token->type == OP_NOTSPACE, err);
2400 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2410 /* Must not happen? */
2416 fetch_token (token, regexp, syntax);
2418 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2419 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2421 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2422 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2424 /* In BRE consecutive duplications are not allowed. */
2425 if ((syntax & RE_CONTEXT_INVALID_DUP)
2426 && (token->type == OP_DUP_ASTERISK
2427 || token->type == OP_OPEN_DUP_NUM))
2437 /* This function build the following tree, from regular expression
2445 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2446 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2448 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2451 cur_nsub = preg->re_nsub++;
2453 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2455 /* The subexpression may be a null string. */
2456 if (token->type == OP_CLOSE_SUBEXP)
2460 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2461 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2463 if (BE (*err != REG_NOERROR, 0))
2467 if (cur_nsub <= '9' - '1')
2468 dfa->completed_bkref_map |= 1 << cur_nsub;
2470 tree = create_tree (dfa, tree, NULL, SUBEXP);
2471 if (BE (tree == NULL, 0))
2476 tree->token.opr.idx = cur_nsub;
2480 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2483 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2484 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2486 bin_tree_t *tree = NULL, *old_tree = NULL;
2487 int i, start, end, start_idx = re_string_cur_idx (regexp);
2488 #ifndef RE_TOKEN_INIT_BUG
2489 re_token_t start_token = *token;
2491 re_token_t start_token;
2493 memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2496 if (token->type == OP_OPEN_DUP_NUM)
2499 start = fetch_number (regexp, token, syntax);
2502 if (token->type == CHARACTER && token->opr.c == ',')
2503 start = 0; /* We treat "{,m}" as "{0,m}". */
2506 *err = REG_BADBR; /* <re>{} is invalid. */
2510 if (BE (start != -2, 1))
2512 /* We treat "{n}" as "{n,n}". */
2513 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2514 : ((token->type == CHARACTER && token->opr.c == ',')
2515 ? fetch_number (regexp, token, syntax) : -2));
2517 if (BE (start == -2 || end == -2, 0))
2519 /* Invalid sequence. */
2520 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2522 if (token->type == END_OF_RE)
2530 /* If the syntax bit is set, rollback. */
2531 re_string_set_index (regexp, start_idx);
2532 *token = start_token;
2533 token->type = CHARACTER;
2534 /* mb_partial and word_char bits should be already initialized by
2539 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2541 /* First number greater than second. */
2548 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2549 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2552 fetch_token (token, regexp, syntax);
2554 if (BE (elem == NULL, 0))
2556 if (BE (start == 0 && end == 0, 0))
2558 postorder (elem, free_tree, NULL);
2562 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2563 if (BE (start > 0, 0))
2566 for (i = 2; i <= start; ++i)
2568 elem = duplicate_tree (elem, dfa);
2569 tree = create_tree (dfa, tree, elem, CONCAT);
2570 if (BE (elem == NULL || tree == NULL, 0))
2571 goto parse_dup_op_espace;
2577 /* Duplicate ELEM before it is marked optional. */
2578 elem = duplicate_tree (elem, dfa);
2584 if (elem->token.type == SUBEXP)
2585 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2587 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2588 if (BE (tree == NULL, 0))
2589 goto parse_dup_op_espace;
2591 /* This loop is actually executed only when end != -1,
2592 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2593 already created the start+1-th copy. */
2594 for (i = start + 2; i <= end; ++i)
2596 elem = duplicate_tree (elem, dfa);
2597 tree = create_tree (dfa, tree, elem, CONCAT);
2598 if (BE (elem == NULL || tree == NULL, 0))
2599 goto parse_dup_op_espace;
2601 tree = create_tree (dfa, tree, NULL, OP_ALT);
2602 if (BE (tree == NULL, 0))
2603 goto parse_dup_op_espace;
2607 tree = create_tree (dfa, old_tree, tree, CONCAT);
2611 parse_dup_op_espace:
2616 /* Size of the names for collating symbol/equivalence_class/character_class.
2617 I'm not sure, but maybe enough. */
2618 #define BRACKET_NAME_BUF_SIZE 32
2621 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2622 Build the range expression which starts from START_ELEM, and ends
2623 at END_ELEM. The result are written to MBCSET and SBCSET.
2624 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2625 mbcset->range_ends, is a pointer argument sinse we may
2628 static reg_errcode_t
2630 # ifdef RE_ENABLE_I18N
2631 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2632 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2633 # else /* not RE_ENABLE_I18N */
2634 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2635 bracket_elem_t *end_elem)
2636 # endif /* not RE_ENABLE_I18N */
2638 unsigned int start_ch, end_ch;
2639 /* Equivalence Classes and Character Classes can't be a range start/end. */
2640 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2641 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2645 /* We can handle no multi character collating elements without libc
2647 if (BE ((start_elem->type == COLL_SYM
2648 && strlen ((char *) start_elem->opr.name) > 1)
2649 || (end_elem->type == COLL_SYM
2650 && strlen ((char *) end_elem->opr.name) > 1), 0))
2651 return REG_ECOLLATE;
2653 # ifdef RE_ENABLE_I18N
2658 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2660 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2661 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2663 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2664 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2668 * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2669 * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2670 * unsigned, so we don't have sign extension problems.
2672 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2673 ? start_ch : start_elem->opr.wch);
2674 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2675 ? end_ch : end_elem->opr.wch);
2677 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2678 ? __btowc (start_ch) : start_elem->opr.wch);
2679 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2680 ? __btowc (end_ch) : end_elem->opr.wch);
2682 if (start_wc == WEOF || end_wc == WEOF)
2683 return REG_ECOLLATE;
2684 cmp_buf[0] = start_wc;
2685 cmp_buf[4] = end_wc;
2686 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2689 /* Got valid collation sequence values, add them as a new entry.
2690 However, for !_LIBC we have no collation elements: if the
2691 character set is single byte, the single byte character set
2692 that we build below suffices. parse_bracket_exp passes
2693 no MBCSET if dfa->mb_cur_max == 1. */
2696 /* Check the space of the arrays. */
2697 if (BE (*range_alloc == mbcset->nranges, 0))
2699 /* There is not enough space, need realloc. */
2700 wchar_t *new_array_start, *new_array_end;
2703 /* +1 in case of mbcset->nranges is 0. */
2704 new_nranges = 2 * mbcset->nranges + 1;
2705 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2706 are NULL if *range_alloc == 0. */
2707 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2709 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2712 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2715 mbcset->range_starts = new_array_start;
2716 mbcset->range_ends = new_array_end;
2717 *range_alloc = new_nranges;
2720 mbcset->range_starts[mbcset->nranges] = start_wc;
2721 mbcset->range_ends[mbcset->nranges++] = end_wc;
2724 /* Build the table for single byte characters. */
2725 for (wc = 0; wc < SBC_MAX; ++wc)
2728 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2729 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2730 bitset_set (sbcset, wc);
2733 # else /* not RE_ENABLE_I18N */
2736 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2737 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2739 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2740 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2742 if (start_ch > end_ch)
2744 /* Build the table for single byte characters. */
2745 for (ch = 0; ch < SBC_MAX; ++ch)
2746 if (start_ch <= ch && ch <= end_ch)
2747 bitset_set (sbcset, ch);
2749 # endif /* not RE_ENABLE_I18N */
2752 #endif /* not _LIBC */
2755 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2756 Build the collating element which is represented by NAME.
2757 The result are written to MBCSET and SBCSET.
2758 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2759 pointer argument since we may update it. */
2761 static reg_errcode_t
2763 # ifdef RE_ENABLE_I18N
2764 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2765 int *coll_sym_alloc, const unsigned char *name)
2766 # else /* not RE_ENABLE_I18N */
2767 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2768 # endif /* not RE_ENABLE_I18N */
2770 size_t name_len = strlen ((const char *) name);
2771 if (BE (name_len != 1, 0))
2772 return REG_ECOLLATE;
2775 bitset_set (sbcset, name[0]);
2779 #endif /* not _LIBC */
2781 /* This function parse bracket expression like "[abc]", "[a-c]",
2785 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2786 reg_syntax_t syntax, reg_errcode_t *err)
2789 const unsigned char *collseqmb;
2790 const char *collseqwc;
2793 const int32_t *symb_table;
2794 const unsigned char *extra;
2796 /* Local function for parse_bracket_exp used in _LIBC environement.
2797 Seek the collating symbol entry correspondings to NAME.
2798 Return the index of the symbol in the SYMB_TABLE. */
2801 __attribute ((always_inline))
2802 seek_collating_symbol_entry (name, name_len)
2803 const unsigned char *name;
2806 int32_t hash = elem_hash ((const char *) name, name_len);
2807 int32_t elem = hash % table_size;
2808 if (symb_table[2 * elem] != 0)
2810 int32_t second = hash % (table_size - 2) + 1;
2814 /* First compare the hashing value. */
2815 if (symb_table[2 * elem] == hash
2816 /* Compare the length of the name. */
2817 && name_len == extra[symb_table[2 * elem + 1]]
2818 /* Compare the name. */
2819 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2822 /* Yep, this is the entry. */
2829 while (symb_table[2 * elem] != 0);
2834 /* Local function for parse_bracket_exp used in _LIBC environment.
2835 Look up the collation sequence value of BR_ELEM.
2836 Return the value if succeeded, UINT_MAX otherwise. */
2838 auto inline unsigned int
2839 __attribute ((always_inline))
2840 lookup_collation_sequence_value (br_elem)
2841 bracket_elem_t *br_elem;
2843 if (br_elem->type == SB_CHAR)
2846 if (MB_CUR_MAX == 1)
2849 return collseqmb[br_elem->opr.ch];
2852 wint_t wc = __btowc (br_elem->opr.ch);
2853 return __collseq_table_lookup (collseqwc, wc);
2856 else if (br_elem->type == MB_CHAR)
2859 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2861 else if (br_elem->type == COLL_SYM)
2863 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2867 elem = seek_collating_symbol_entry (br_elem->opr.name,
2869 if (symb_table[2 * elem] != 0)
2871 /* We found the entry. */
2872 idx = symb_table[2 * elem + 1];
2873 /* Skip the name of collating element name. */
2874 idx += 1 + extra[idx];
2875 /* Skip the byte sequence of the collating element. */
2876 idx += 1 + extra[idx];
2877 /* Adjust for the alignment. */
2878 idx = (idx + 3) & ~3;
2879 /* Skip the multibyte collation sequence value. */
2880 idx += sizeof (unsigned int);
2881 /* Skip the wide char sequence of the collating element. */
2882 idx += sizeof (unsigned int) *
2883 (1 + *(unsigned int *) (extra + idx));
2884 /* Return the collation sequence value. */
2885 return *(unsigned int *) (extra + idx);
2887 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2889 /* No valid character. Match it as a single byte
2891 return collseqmb[br_elem->opr.name[0]];
2894 else if (sym_name_len == 1)
2895 return collseqmb[br_elem->opr.name[0]];
2900 /* Local function for parse_bracket_exp used in _LIBC environement.
2901 Build the range expression which starts from START_ELEM, and ends
2902 at END_ELEM. The result are written to MBCSET and SBCSET.
2903 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2904 mbcset->range_ends, is a pointer argument sinse we may
2907 auto inline reg_errcode_t
2908 __attribute ((always_inline))
2909 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2910 re_charset_t *mbcset;
2913 bracket_elem_t *start_elem, *end_elem;
2916 uint32_t start_collseq;
2917 uint32_t end_collseq;
2919 /* Equivalence Classes and Character Classes can't be a range
2921 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2922 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2926 start_collseq = lookup_collation_sequence_value (start_elem);
2927 end_collseq = lookup_collation_sequence_value (end_elem);
2928 /* Check start/end collation sequence values. */
2929 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2930 return REG_ECOLLATE;
2931 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2934 /* Got valid collation sequence values, add them as a new entry.
2935 However, if we have no collation elements, and the character set
2936 is single byte, the single byte character set that we
2937 build below suffices. */
2938 if (nrules > 0 || dfa->mb_cur_max > 1)
2940 /* Check the space of the arrays. */
2941 if (BE (*range_alloc == mbcset->nranges, 0))
2943 /* There is not enough space, need realloc. */
2944 uint32_t *new_array_start;
2945 uint32_t *new_array_end;
2948 /* +1 in case of mbcset->nranges is 0. */
2949 new_nranges = 2 * mbcset->nranges + 1;
2950 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2952 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2955 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2958 mbcset->range_starts = new_array_start;
2959 mbcset->range_ends = new_array_end;
2960 *range_alloc = new_nranges;
2963 mbcset->range_starts[mbcset->nranges] = start_collseq;
2964 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2967 /* Build the table for single byte characters. */
2968 for (ch = 0; ch < SBC_MAX; ch++)
2970 uint32_t ch_collseq;
2972 if (MB_CUR_MAX == 1)
2975 ch_collseq = collseqmb[ch];
2977 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2978 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2979 bitset_set (sbcset, ch);
2984 /* Local function for parse_bracket_exp used in _LIBC environement.
2985 Build the collating element which is represented by NAME.
2986 The result are written to MBCSET and SBCSET.
2987 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2988 pointer argument sinse we may update it. */
2990 auto inline reg_errcode_t
2991 __attribute ((always_inline))
2992 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2993 re_charset_t *mbcset;
2994 int *coll_sym_alloc;
2996 const unsigned char *name;
2999 size_t name_len = strlen ((const char *) name);
3002 elem = seek_collating_symbol_entry (name, name_len);
3003 if (symb_table[2 * elem] != 0)
3005 /* We found the entry. */
3006 idx = symb_table[2 * elem + 1];
3007 /* Skip the name of collating element name. */
3008 idx += 1 + extra[idx];
3010 else if (symb_table[2 * elem] == 0 && name_len == 1)
3012 /* No valid character, treat it as a normal
3014 bitset_set (sbcset, name[0]);
3018 return REG_ECOLLATE;
3020 /* Got valid collation sequence, add it as a new entry. */
3021 /* Check the space of the arrays. */
3022 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3024 /* Not enough, realloc it. */
3025 /* +1 in case of mbcset->ncoll_syms is 0. */
3026 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3027 /* Use realloc since mbcset->coll_syms is NULL
3029 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3030 new_coll_sym_alloc);
3031 if (BE (new_coll_syms == NULL, 0))
3033 mbcset->coll_syms = new_coll_syms;
3034 *coll_sym_alloc = new_coll_sym_alloc;
3036 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3041 if (BE (name_len != 1, 0))
3042 return REG_ECOLLATE;
3045 bitset_set (sbcset, name[0]);
3052 re_token_t br_token;
3053 re_bitset_ptr_t sbcset;
3054 #ifdef RE_ENABLE_I18N
3055 re_charset_t *mbcset;
3056 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3057 int equiv_class_alloc = 0, char_class_alloc = 0;
3058 #endif /* not RE_ENABLE_I18N */
3060 bin_tree_t *work_tree;
3062 int first_round = 1;
3064 collseqmb = (const unsigned char *)
3065 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3066 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3072 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3073 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3074 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3075 _NL_COLLATE_SYMB_TABLEMB);
3076 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3077 _NL_COLLATE_SYMB_EXTRAMB);
3080 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3081 #ifdef RE_ENABLE_I18N
3082 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3083 #endif /* RE_ENABLE_I18N */
3084 #ifdef RE_ENABLE_I18N
3085 if (BE (sbcset == NULL || mbcset == NULL, 0))
3087 if (BE (sbcset == NULL, 0))
3088 #endif /* RE_ENABLE_I18N */
3094 token_len = peek_token_bracket (token, regexp, syntax);
3095 if (BE (token->type == END_OF_RE, 0))
3098 goto parse_bracket_exp_free_return;
3100 if (token->type == OP_NON_MATCH_LIST)
3102 #ifdef RE_ENABLE_I18N
3103 mbcset->non_match = 1;
3104 #endif /* not RE_ENABLE_I18N */
3106 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3107 bitset_set (sbcset, '\n');
3108 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3109 token_len = peek_token_bracket (token, regexp, syntax);
3110 if (BE (token->type == END_OF_RE, 0))
3113 goto parse_bracket_exp_free_return;
3117 /* We treat the first ']' as a normal character. */
3118 if (token->type == OP_CLOSE_BRACKET)
3119 token->type = CHARACTER;
3123 bracket_elem_t start_elem, end_elem;
3124 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3125 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3127 int token_len2 = 0, is_range_exp = 0;
3130 start_elem.opr.name = start_name_buf;
3131 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3132 syntax, first_round);
3133 if (BE (ret != REG_NOERROR, 0))
3136 goto parse_bracket_exp_free_return;
3140 /* Get information about the next token. We need it in any case. */
3141 token_len = peek_token_bracket (token, regexp, syntax);
3143 /* Do not check for ranges if we know they are not allowed. */
3144 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3146 if (BE (token->type == END_OF_RE, 0))
3149 goto parse_bracket_exp_free_return;
3151 if (token->type == OP_CHARSET_RANGE)
3153 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3154 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3155 if (BE (token2.type == END_OF_RE, 0))
3158 goto parse_bracket_exp_free_return;
3160 if (token2.type == OP_CLOSE_BRACKET)
3162 /* We treat the last '-' as a normal character. */
3163 re_string_skip_bytes (regexp, -token_len);
3164 token->type = CHARACTER;
3171 if (is_range_exp == 1)
3173 end_elem.opr.name = end_name_buf;
3174 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3176 if (BE (ret != REG_NOERROR, 0))
3179 goto parse_bracket_exp_free_return;
3182 token_len = peek_token_bracket (token, regexp, syntax);
3185 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3186 &start_elem, &end_elem);
3188 # ifdef RE_ENABLE_I18N
3189 *err = build_range_exp (sbcset,
3190 dfa->mb_cur_max > 1 ? mbcset : NULL,
3191 &range_alloc, &start_elem, &end_elem);
3193 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3195 #endif /* RE_ENABLE_I18N */
3196 if (BE (*err != REG_NOERROR, 0))
3197 goto parse_bracket_exp_free_return;
3201 switch (start_elem.type)
3204 bitset_set (sbcset, start_elem.opr.ch);
3206 #ifdef RE_ENABLE_I18N
3208 /* Check whether the array has enough space. */
3209 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3211 wchar_t *new_mbchars;
3212 /* Not enough, realloc it. */
3213 /* +1 in case of mbcset->nmbchars is 0. */
3214 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3215 /* Use realloc since array is NULL if *alloc == 0. */
3216 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3218 if (BE (new_mbchars == NULL, 0))
3219 goto parse_bracket_exp_espace;
3220 mbcset->mbchars = new_mbchars;
3222 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3224 #endif /* RE_ENABLE_I18N */
3226 *err = build_equiv_class (sbcset,
3227 #ifdef RE_ENABLE_I18N
3228 mbcset, &equiv_class_alloc,
3229 #endif /* RE_ENABLE_I18N */
3230 start_elem.opr.name);
3231 if (BE (*err != REG_NOERROR, 0))
3232 goto parse_bracket_exp_free_return;
3235 *err = build_collating_symbol (sbcset,
3236 #ifdef RE_ENABLE_I18N
3237 mbcset, &coll_sym_alloc,
3238 #endif /* RE_ENABLE_I18N */
3239 start_elem.opr.name);
3240 if (BE (*err != REG_NOERROR, 0))
3241 goto parse_bracket_exp_free_return;
3244 *err = build_charclass (regexp->trans, sbcset,
3245 #ifdef RE_ENABLE_I18N
3246 mbcset, &char_class_alloc,
3247 #endif /* RE_ENABLE_I18N */
3248 (const char *) start_elem.opr.name, syntax);
3249 if (BE (*err != REG_NOERROR, 0))
3250 goto parse_bracket_exp_free_return;
3257 if (BE (token->type == END_OF_RE, 0))
3260 goto parse_bracket_exp_free_return;
3262 if (token->type == OP_CLOSE_BRACKET)
3266 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3268 /* If it is non-matching list. */
3270 bitset_not (sbcset);
3272 #ifdef RE_ENABLE_I18N
3273 /* Ensure only single byte characters are set. */
3274 if (dfa->mb_cur_max > 1)
3275 bitset_mask (sbcset, dfa->sb_char);
3277 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3278 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3279 || mbcset->non_match)))
3281 bin_tree_t *mbc_tree;
3283 /* Build a tree for complex bracket. */
3284 dfa->has_mb_node = 1;
3285 br_token.type = COMPLEX_BRACKET;
3286 br_token.opr.mbcset = mbcset;
3287 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3288 if (BE (mbc_tree == NULL, 0))
3289 goto parse_bracket_exp_espace;
3290 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3291 if (sbcset[sbc_idx])
3293 /* If there are no bits set in sbcset, there is no point
3294 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3295 if (sbc_idx < BITSET_WORDS)
3297 /* Build a tree for simple bracket. */
3298 br_token.type = SIMPLE_BRACKET;
3299 br_token.opr.sbcset = sbcset;
3300 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3301 if (BE (work_tree == NULL, 0))
3302 goto parse_bracket_exp_espace;
3304 /* Then join them by ALT node. */
3305 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3306 if (BE (work_tree == NULL, 0))
3307 goto parse_bracket_exp_espace;
3312 work_tree = mbc_tree;
3316 #endif /* not RE_ENABLE_I18N */
3318 #ifdef RE_ENABLE_I18N
3319 free_charset (mbcset);
3321 /* Build a tree for simple bracket. */
3322 br_token.type = SIMPLE_BRACKET;
3323 br_token.opr.sbcset = sbcset;
3324 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3325 if (BE (work_tree == NULL, 0))
3326 goto parse_bracket_exp_espace;
3330 parse_bracket_exp_espace:
3332 parse_bracket_exp_free_return:
3334 #ifdef RE_ENABLE_I18N
3335 free_charset (mbcset);
3336 #endif /* RE_ENABLE_I18N */
3340 /* Parse an element in the bracket expression. */
3342 static reg_errcode_t
3343 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3344 re_token_t *token, int token_len, re_dfa_t *dfa,
3345 reg_syntax_t syntax, int accept_hyphen)
3347 #ifdef RE_ENABLE_I18N
3349 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3350 if (cur_char_size > 1)
3352 elem->type = MB_CHAR;
3353 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3354 re_string_skip_bytes (regexp, cur_char_size);
3357 #endif /* RE_ENABLE_I18N */
3358 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3359 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3360 || token->type == OP_OPEN_EQUIV_CLASS)
3361 return parse_bracket_symbol (elem, regexp, token);
3362 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3364 /* A '-' must only appear as anything but a range indicator before
3365 the closing bracket. Everything else is an error. */
3367 (void) peek_token_bracket (&token2, regexp, syntax);
3368 if (token2.type != OP_CLOSE_BRACKET)
3369 /* The actual error value is not standardized since this whole
3370 case is undefined. But ERANGE makes good sense. */
3373 elem->type = SB_CHAR;
3374 elem->opr.ch = token->opr.c;
3378 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3379 such as [:<character_class>:], [.<collating_element>.], and
3380 [=<equivalent_class>=]. */
3382 static reg_errcode_t
3383 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3386 unsigned char ch, delim = token->opr.c;
3388 if (re_string_eoi(regexp))
3392 if (i >= BRACKET_NAME_BUF_SIZE)
3394 if (token->type == OP_OPEN_CHAR_CLASS)
3395 ch = re_string_fetch_byte_case (regexp);
3397 ch = re_string_fetch_byte (regexp);
3398 if (re_string_eoi(regexp))
3400 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3402 elem->opr.name[i] = ch;
3404 re_string_skip_bytes (regexp, 1);
3405 elem->opr.name[i] = '\0';
3406 switch (token->type)
3408 case OP_OPEN_COLL_ELEM:
3409 elem->type = COLL_SYM;
3411 case OP_OPEN_EQUIV_CLASS:
3412 elem->type = EQUIV_CLASS;
3414 case OP_OPEN_CHAR_CLASS:
3415 elem->type = CHAR_CLASS;
3423 /* Helper function for parse_bracket_exp.
3424 Build the equivalence class which is represented by NAME.
3425 The result are written to MBCSET and SBCSET.
3426 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3427 is a pointer argument sinse we may update it. */
3429 static reg_errcode_t
3430 #ifdef RE_ENABLE_I18N
3431 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3432 int *equiv_class_alloc, const unsigned char *name)
3433 #else /* not RE_ENABLE_I18N */
3434 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3435 #endif /* not RE_ENABLE_I18N */
3438 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3441 const int32_t *table, *indirect;
3442 const unsigned char *weights, *extra, *cp;
3443 unsigned char char_buf[2];
3447 /* This #include defines a local function! */
3448 # include <locale/weight.h>
3449 /* Calculate the index for equivalence class. */
3451 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3452 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3453 _NL_COLLATE_WEIGHTMB);
3454 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3455 _NL_COLLATE_EXTRAMB);
3456 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3457 _NL_COLLATE_INDIRECTMB);
3458 idx1 = findidx (&cp);
3459 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3460 /* This isn't a valid character. */
3461 return REG_ECOLLATE;
3463 /* Build single byte matcing table for this equivalence class. */
3464 char_buf[1] = (unsigned char) '\0';
3465 len = weights[idx1 & 0xffffff];
3466 for (ch = 0; ch < SBC_MAX; ++ch)
3470 idx2 = findidx (&cp);
3475 /* This isn't a valid character. */
3477 /* Compare only if the length matches and the collation rule
3478 index is the same. */
3479 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3483 while (cnt <= len &&
3484 weights[(idx1 & 0xffffff) + 1 + cnt]
3485 == weights[(idx2 & 0xffffff) + 1 + cnt])
3489 bitset_set (sbcset, ch);
3492 /* Check whether the array has enough space. */
3493 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3495 /* Not enough, realloc it. */
3496 /* +1 in case of mbcset->nequiv_classes is 0. */
3497 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3498 /* Use realloc since the array is NULL if *alloc == 0. */
3499 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3501 new_equiv_class_alloc);
3502 if (BE (new_equiv_classes == NULL, 0))
3504 mbcset->equiv_classes = new_equiv_classes;
3505 *equiv_class_alloc = new_equiv_class_alloc;
3507 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3512 if (BE (strlen ((const char *) name) != 1, 0))
3513 return REG_ECOLLATE;
3514 bitset_set (sbcset, *name);
3519 /* Helper function for parse_bracket_exp.
3520 Build the character class which is represented by NAME.
3521 The result are written to MBCSET and SBCSET.
3522 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3523 is a pointer argument sinse we may update it. */
3525 static reg_errcode_t
3526 #ifdef RE_ENABLE_I18N
3527 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3528 re_charset_t *mbcset, int *char_class_alloc,
3529 const char *class_name, reg_syntax_t syntax)
3530 #else /* not RE_ENABLE_I18N */
3531 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3532 const char *class_name, reg_syntax_t syntax)
3533 #endif /* not RE_ENABLE_I18N */
3537 /* In case of REG_ICASE "upper" and "lower" match the both of
3538 upper and lower cases. */
3539 if ((syntax & RE_ICASE)
3540 && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3541 class_name = "alpha";
3543 #ifdef RE_ENABLE_I18N
3544 /* Check the space of the arrays. */
3545 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3547 /* Not enough, realloc it. */
3548 /* +1 in case of mbcset->nchar_classes is 0. */
3549 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3550 /* Use realloc since array is NULL if *alloc == 0. */
3551 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3552 new_char_class_alloc);
3553 if (BE (new_char_classes == NULL, 0))
3555 mbcset->char_classes = new_char_classes;
3556 *char_class_alloc = new_char_class_alloc;
3558 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3559 #endif /* RE_ENABLE_I18N */
3561 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3563 if (BE (trans != NULL, 0)) \
3565 for (i = 0; i < SBC_MAX; ++i) \
3566 if (ctype_func (i)) \
3567 bitset_set (sbcset, trans[i]); \
3571 for (i = 0; i < SBC_MAX; ++i) \
3572 if (ctype_func (i)) \
3573 bitset_set (sbcset, i); \
3577 if (strcmp (class_name, "alnum") == 0)
3578 BUILD_CHARCLASS_LOOP (isalnum);
3579 else if (strcmp (class_name, "cntrl") == 0)
3580 BUILD_CHARCLASS_LOOP (iscntrl);
3581 else if (strcmp (class_name, "lower") == 0)
3582 BUILD_CHARCLASS_LOOP (islower);
3583 else if (strcmp (class_name, "space") == 0)
3584 BUILD_CHARCLASS_LOOP (isspace);
3585 else if (strcmp (class_name, "alpha") == 0)
3586 BUILD_CHARCLASS_LOOP (isalpha);
3587 else if (strcmp (class_name, "digit") == 0)
3588 BUILD_CHARCLASS_LOOP (isdigit);
3589 else if (strcmp (class_name, "print") == 0)
3590 BUILD_CHARCLASS_LOOP (isprint);
3591 else if (strcmp (class_name, "upper") == 0)
3592 BUILD_CHARCLASS_LOOP (isupper);
3593 else if (strcmp (class_name, "blank") == 0)
3595 BUILD_CHARCLASS_LOOP (isblank);
3597 /* see comments above */
3598 BUILD_CHARCLASS_LOOP (is_blank);
3600 else if (strcmp (class_name, "graph") == 0)
3601 BUILD_CHARCLASS_LOOP (isgraph);
3602 else if (strcmp (class_name, "punct") == 0)
3603 BUILD_CHARCLASS_LOOP (ispunct);
3604 else if (strcmp (class_name, "xdigit") == 0)
3605 BUILD_CHARCLASS_LOOP (isxdigit);
3613 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3614 const char *class_name,
3615 const char *extra, int non_match,
3618 re_bitset_ptr_t sbcset;
3619 #ifdef RE_ENABLE_I18N
3620 re_charset_t *mbcset;
3622 #endif /* not RE_ENABLE_I18N */
3624 re_token_t br_token;
3627 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3628 #ifdef RE_ENABLE_I18N
3629 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3630 #endif /* RE_ENABLE_I18N */
3632 #ifdef RE_ENABLE_I18N
3633 if (BE (sbcset == NULL || mbcset == NULL, 0))
3634 #else /* not RE_ENABLE_I18N */
3635 if (BE (sbcset == NULL, 0))
3636 #endif /* not RE_ENABLE_I18N */
3644 #ifdef RE_ENABLE_I18N
3645 mbcset->non_match = 1;
3646 #endif /* not RE_ENABLE_I18N */
3649 /* We don't care the syntax in this case. */
3650 ret = build_charclass (trans, sbcset,
3651 #ifdef RE_ENABLE_I18N
3653 #endif /* RE_ENABLE_I18N */
3656 if (BE (ret != REG_NOERROR, 0))
3659 #ifdef RE_ENABLE_I18N
3660 free_charset (mbcset);
3661 #endif /* RE_ENABLE_I18N */
3665 /* \w match '_' also. */
3666 for (; *extra; extra++)
3667 bitset_set (sbcset, *extra);
3669 /* If it is non-matching list. */
3671 bitset_not (sbcset);
3673 #ifdef RE_ENABLE_I18N
3674 /* Ensure only single byte characters are set. */
3675 if (dfa->mb_cur_max > 1)
3676 bitset_mask (sbcset, dfa->sb_char);
3679 /* Build a tree for simple bracket. */
3680 br_token.type = SIMPLE_BRACKET;
3681 br_token.opr.sbcset = sbcset;
3682 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3683 if (BE (tree == NULL, 0))
3684 goto build_word_op_espace;
3686 #ifdef RE_ENABLE_I18N
3687 if (dfa->mb_cur_max > 1)
3689 bin_tree_t *mbc_tree;
3690 /* Build a tree for complex bracket. */
3691 br_token.type = COMPLEX_BRACKET;
3692 br_token.opr.mbcset = mbcset;
3693 dfa->has_mb_node = 1;
3694 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3695 if (BE (mbc_tree == NULL, 0))
3696 goto build_word_op_espace;
3697 /* Then join them by ALT node. */
3698 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3699 if (BE (mbc_tree != NULL, 1))
3704 free_charset (mbcset);
3707 #else /* not RE_ENABLE_I18N */
3709 #endif /* not RE_ENABLE_I18N */
3711 build_word_op_espace:
3713 #ifdef RE_ENABLE_I18N
3714 free_charset (mbcset);
3715 #endif /* RE_ENABLE_I18N */
3720 /* This is intended for the expressions like "a{1,3}".
3721 Fetch a number from `input', and return the number.
3722 Return -1, if the number field is empty like "{,1}".
3723 Return -2, If an error is occured. */
3726 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3732 fetch_token (token, input, syntax);
3734 if (BE (token->type == END_OF_RE, 0))
3736 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3738 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3739 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3740 num = (num > RE_DUP_MAX) ? -2 : num;
3745 #ifdef RE_ENABLE_I18N
3747 free_charset (re_charset_t *cset)
3749 re_free (cset->mbchars);
3751 re_free (cset->coll_syms);
3752 re_free (cset->equiv_classes);
3753 re_free (cset->range_starts);
3754 re_free (cset->range_ends);
3756 re_free (cset->char_classes);
3759 #endif /* RE_ENABLE_I18N */
3761 /* Functions for binary tree operation. */
3763 /* Create a tree node. */
3766 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3767 re_token_type_t type)
3771 return create_token_tree (dfa, left, right, &t);
3775 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3776 const re_token_t *token)
3779 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3781 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3783 if (storage == NULL)
3785 storage->next = dfa->str_tree_storage;
3786 dfa->str_tree_storage = storage;
3787 dfa->str_tree_storage_idx = 0;
3789 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3791 tree->parent = NULL;
3793 tree->right = right;
3794 tree->token = *token;
3795 tree->token.duplicated = 0;
3796 tree->token.opt_subexp = 0;
3799 tree->node_idx = -1;
3802 left->parent = tree;
3804 right->parent = tree;
3808 /* Mark the tree SRC as an optional subexpression.
3809 To be called from preorder or postorder. */
3811 static reg_errcode_t
3812 mark_opt_subexp (void *extra, bin_tree_t *node)
3814 int idx = (int) (long) extra;
3815 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3816 node->token.opt_subexp = 1;
3821 /* Free the allocated memory inside NODE. */
3824 free_token (re_token_t *node)
3826 #ifdef RE_ENABLE_I18N
3827 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3828 free_charset (node->opr.mbcset);
3830 #endif /* RE_ENABLE_I18N */
3831 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3832 re_free (node->opr.sbcset);
3835 /* Worker function for tree walking. Free the allocated memory inside NODE
3836 and its children. */
3838 static reg_errcode_t
3839 free_tree (void *extra, bin_tree_t *node)
3841 free_token (&node->token);
3846 /* Duplicate the node SRC, and return new node. This is a preorder
3847 visit similar to the one implemented by the generic visitor, but
3848 we need more infrastructure to maintain two parallel trees --- so,
3849 it's easier to duplicate. */
3852 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3854 const bin_tree_t *node;
3855 bin_tree_t *dup_root;
3856 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3858 for (node = root; ; )
3860 /* Create a new tree and link it back to the current parent. */
3861 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3864 (*p_new)->parent = dup_node;
3865 (*p_new)->token.duplicated = 1;
3868 /* Go to the left node, or up and to the right. */
3872 p_new = &dup_node->left;
3876 const bin_tree_t *prev = NULL;
3877 while (node->right == prev || node->right == NULL)
3880 node = node->parent;
3881 dup_node = dup_node->parent;
3886 p_new = &dup_node->right;