git.oblomov.eu Git - git/blob - compat/regex/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 #if defined __TANDEM
  21  /* This is currently duplicated from git-compat-utils.h */
  22 # ifdef NO_INTPTR_T
  23  typedef long intptr_t;
  24  typedef unsigned long uintptr_t;
  25 # endif
  26 #endif
  27
  28 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  29                                           size_t length, reg_syntax_t syntax);
  30 static void re_compile_fastmap_iter (regex_t *bufp,
  31                                      const re_dfastate_t *init_state,
  32                                      char *fastmap);
  33 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  34 #ifdef RE_ENABLE_I18N
  35 static void free_charset (re_charset_t *cset);
  36 #endif /* RE_ENABLE_I18N */
  37 static void free_workarea_compile (regex_t *preg);
  38 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  39 #ifdef RE_ENABLE_I18N
  40 static void optimize_utf8 (re_dfa_t *dfa);
  41 #endif
  42 static reg_errcode_t analyze (regex_t *preg);
  43 static reg_errcode_t preorder (bin_tree_t *root,
  44                                reg_errcode_t (fn (void *, bin_tree_t *)),
  45                                void *extra);
  46 static reg_errcode_t postorder (bin_tree_t *root,
  47                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  48                                 void *extra);
  49 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  50 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  51 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  52                                  bin_tree_t *node);
  53 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  54 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  55 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  56 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  57 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  58                                    unsigned int constraint);
  59 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  60 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  61                                          int node, int root);
  62 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  63 static int fetch_number (re_string_t *input, re_token_t *token,
  64                          reg_syntax_t syntax);
  65 static int peek_token (re_token_t *token, re_string_t *input,
  66                         reg_syntax_t syntax) internal_function;
  67 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  68                           reg_syntax_t syntax, reg_errcode_t *err);
  69 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  70                                   re_token_t *token, reg_syntax_t syntax,
  71                                   int nest, reg_errcode_t *err);
  72 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  73                                  re_token_t *token, reg_syntax_t syntax,
  74                                  int nest, reg_errcode_t *err);
  75 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  76                                      re_token_t *token, reg_syntax_t syntax,
  77                                      int nest, reg_errcode_t *err);
  78 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  79                                   re_token_t *token, reg_syntax_t syntax,
  80                                   int nest, reg_errcode_t *err);
  81 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  82                                  re_dfa_t *dfa, re_token_t *token,
  83                                  reg_syntax_t syntax, reg_errcode_t *err);
  84 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  85                                       re_token_t *token, reg_syntax_t syntax,
  86                                       reg_errcode_t *err);
  87 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  88                                             re_string_t *regexp,
  89                                             re_token_t *token, int token_len,
  90                                             re_dfa_t *dfa,
  91                                             reg_syntax_t syntax,
  92                                             int accept_hyphen);
  93 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  94                                           re_string_t *regexp,
  95                                           re_token_t *token);
  96 #ifdef RE_ENABLE_I18N
  97 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  98                                         re_charset_t *mbcset,
  99                                         int *equiv_class_alloc,
 100                                         const unsigned char *name);
 101 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 102                                       bitset_t sbcset,
 103                                       re_charset_t *mbcset,
 104                                       int *char_class_alloc,
 105                                       const char *class_name,
 106                                       reg_syntax_t syntax);
 107 #else  /* not RE_ENABLE_I18N */
 108 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 109                                         const unsigned char *name);
 110 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 111                                       bitset_t sbcset,
 112                                       const char *class_name,
 113                                       reg_syntax_t syntax);
 114 #endif /* not RE_ENABLE_I18N */
 115 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 116                                        RE_TRANSLATE_TYPE trans,
 117                                        const char *class_name,
 118                                        const char *extra,
 119                                        int non_match, reg_errcode_t *err);
 120 static bin_tree_t *create_tree (re_dfa_t *dfa,
 121                                 bin_tree_t *left, bin_tree_t *right,
 122                                 re_token_type_t type);
 123 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 124                                       bin_tree_t *left, bin_tree_t *right,
 125                                       const re_token_t *token);
 126 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 127 static void free_token (re_token_t *node);
 128 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 129 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 130 \f
 131 /* This table gives an error message for each of the error codes listed
 132    in regex.h.  Obviously the order here has to be same as there.
 133    POSIX doesn't require that we do anything for REG_NOERROR,
 134    but why not be nice?  */
 135
 136 const char __re_error_msgid[] attribute_hidden =
 137   {
 138 #define REG_NOERROR_IDX 0
 139     gettext_noop ("Success")    /* REG_NOERROR */
 140     "\0"
 141 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 142     gettext_noop ("No match")   /* REG_NOMATCH */
 143     "\0"
 144 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 145     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 146     "\0"
 147 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 148     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 149     "\0"
 150 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 151     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 152     "\0"
 153 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 154     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 155     "\0"
 156 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 157     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 158     "\0"
 159 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 160     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 161     "\0"
 162 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 163     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 164     "\0"
 165 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 166     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 167     "\0"
 168 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 169     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 170     "\0"
 171 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 172     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 173     "\0"
 174 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 175     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 176     "\0"
 177 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 178     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 179     "\0"
 180 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 181     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 182     "\0"
 183 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 184     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 185     "\0"
 186 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 187     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 188   };
 189
 190 const size_t __re_error_msgid_idx[] attribute_hidden =
 191   {
 192     REG_NOERROR_IDX,
 193     REG_NOMATCH_IDX,
 194     REG_BADPAT_IDX,
 195     REG_ECOLLATE_IDX,
 196     REG_ECTYPE_IDX,
 197     REG_EESCAPE_IDX,
 198     REG_ESUBREG_IDX,
 199     REG_EBRACK_IDX,
 200     REG_EPAREN_IDX,
 201     REG_EBRACE_IDX,
 202     REG_BADBR_IDX,
 203     REG_ERANGE_IDX,
 204     REG_ESPACE_IDX,
 205     REG_BADRPT_IDX,
 206     REG_EEND_IDX,
 207     REG_ESIZE_IDX,
 208     REG_ERPAREN_IDX
 209   };
 210 \f
 211 /* Entry points for GNU code.  */
 212
 213
 214 #ifdef ZOS_USS
 215
 216 /* For ZOS USS we must define btowc */
 217
 218 wchar_t
 219 btowc (int c)
 220 {
 221    wchar_t wtmp[2];
 222    char tmp[2];
 223
 224    tmp[0] = c;
 225    tmp[1] = 0;
 226
 227    mbtowc (wtmp, tmp, 1);
 228    return wtmp[0];
 229 }
 230 #endif
 231
 232 /* re_compile_pattern is the GNU regular expression compiler: it
 233    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 234    Returns 0 if the pattern was valid, otherwise an error string.
 235
 236    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 237    are set in BUFP on entry.  */
 238
 239 const char *
 240 re_compile_pattern (const char *pattern,
 241                     size_t length,
 242                     struct re_pattern_buffer *bufp)
 243 {
 244   reg_errcode_t ret;
 245
 246   /* And GNU code determines whether or not to get register information
 247      by passing null for the REGS argument to re_match, etc., not by
 248      setting no_sub, unless RE_NO_SUB is set.  */
 249   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 250
 251   /* Match anchors at newline.  */
 252   bufp->newline_anchor = 1;
 253
 254   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 255
 256   if (!ret)
 257     return NULL;
 258   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 259 }
 260 #ifdef _LIBC
 261 weak_alias (__re_compile_pattern, re_compile_pattern)
 262 #endif
 263
 264 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 265    also be assigned to arbitrarily: each pattern buffer stores its own
 266    syntax, so it can be changed between regex compilations.  */
 267 /* This has no initializer because initialized variables in Emacs
 268    become read-only after dumping.  */
 269 reg_syntax_t re_syntax_options;
 270
 271
 272 /* Specify the precise syntax of regexps for compilation.  This provides
 273    for compatibility for various utilities which historically have
 274    different, incompatible syntaxes.
 275
 276    The argument SYNTAX is a bit mask comprised of the various bits
 277    defined in regex.h.  We return the old syntax.  */
 278
 279 reg_syntax_t
 280 re_set_syntax (reg_syntax_t syntax)
 281 {
 282   reg_syntax_t ret = re_syntax_options;
 283
 284   re_syntax_options = syntax;
 285   return ret;
 286 }
 287 #ifdef _LIBC
 288 weak_alias (__re_set_syntax, re_set_syntax)
 289 #endif
 290
 291 int
 292 re_compile_fastmap (struct re_pattern_buffer *bufp)
 293 {
 294   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 295   char *fastmap = bufp->fastmap;
 296
 297   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 298   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 299   if (dfa->init_state != dfa->init_state_word)
 300     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 301   if (dfa->init_state != dfa->init_state_nl)
 302     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 303   if (dfa->init_state != dfa->init_state_begbuf)
 304     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 305   bufp->fastmap_accurate = 1;
 306   return 0;
 307 }
 308 #ifdef _LIBC
 309 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 310 #endif
 311
 312 static inline void
 313 __attribute ((always_inline))
 314 re_set_fastmap (char *fastmap, int icase, int ch)
 315 {
 316   fastmap[ch] = 1;
 317   if (icase)
 318     fastmap[tolower (ch)] = 1;
 319 }
 320
 321 /* Helper function for re_compile_fastmap.
 322    Compile fastmap for the initial_state INIT_STATE.  */
 323
 324 static void
 325 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 326                          char *fastmap)
 327 {
 328   volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 329   int node_cnt;
 330   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 331   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 332     {
 333       int node = init_state->nodes.elems[node_cnt];
 334       re_token_type_t type = dfa->nodes[node].type;
 335
 336       if (type == CHARACTER)
 337         {
 338           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 339 #ifdef RE_ENABLE_I18N
 340           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 341             {
 342               unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
 343               wchar_t wc;
 344               mbstate_t state;
 345
 346               p = buf;
 347               *p++ = dfa->nodes[node].opr.c;
 348               while (++node < dfa->nodes_len
 349                      && dfa->nodes[node].type == CHARACTER
 350                      && dfa->nodes[node].mb_partial)
 351                 *p++ = dfa->nodes[node].opr.c;
 352               memset (&state, '\0', sizeof (state));
 353               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 354                              &state) == p - buf
 355                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 356                       != (size_t) -1))
 357                 re_set_fastmap (fastmap, 0, buf[0]);
 358               re_free (buf);
 359             }
 360 #endif
 361         }
 362       else if (type == SIMPLE_BRACKET)
 363         {
 364           int i, ch;
 365           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 366             {
 367               int j;
 368               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 369               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 370                 if (w & ((bitset_word_t) 1 << j))
 371                   re_set_fastmap (fastmap, icase, ch);
 372             }
 373         }
 374 #ifdef RE_ENABLE_I18N
 375       else if (type == COMPLEX_BRACKET)
 376         {
 377           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 378           int i;
 379
 380 # ifdef _LIBC
 381           /* See if we have to try all bytes which start multiple collation
 382              elements.
 383              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 384                   collation element, and don't catch 'b' since 'b' is
 385                   the only collation element which starts from 'b' (and
 386                   it is caught by SIMPLE_BRACKET).  */
 387               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 388                   && (cset->ncoll_syms || cset->nranges))
 389                 {
 390                   const int32_t *table = (const int32_t *)
 391                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 392                   for (i = 0; i < SBC_MAX; ++i)
 393                     if (table[i] < 0)
 394                       re_set_fastmap (fastmap, icase, i);
 395                 }
 396 # endif /* _LIBC */
 397
 398           /* See if we have to start the match at all multibyte characters,
 399              i.e. where we would not find an invalid sequence.  This only
 400              applies to multibyte character sets; for single byte character
 401              sets, the SIMPLE_BRACKET again suffices.  */
 402           if (dfa->mb_cur_max > 1
 403               && (cset->nchar_classes || cset->non_match || cset->nranges
 404 # ifdef _LIBC
 405                   || cset->nequiv_classes
 406 # endif /* _LIBC */
 407                  ))
 408             {
 409               unsigned char c = 0;
 410               do
 411                 {
 412                   mbstate_t mbs;
 413                   memset (&mbs, 0, sizeof (mbs));
 414                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 415                     re_set_fastmap (fastmap, false, (int) c);
 416                 }
 417               while (++c != 0);
 418             }
 419
 420           else
 421             {
 422               /* ... Else catch all bytes which can start the mbchars.  */
 423               for (i = 0; i < cset->nmbchars; ++i)
 424                 {
 425                   char buf[256];
 426                   mbstate_t state;
 427                   memset (&state, '\0', sizeof (state));
 428                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 429                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 430                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 431                     {
 432                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 433                           != (size_t) -1)
 434                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 435                     }
 436                 }
 437             }
 438         }
 439 #endif /* RE_ENABLE_I18N */
 440       else if (type == OP_PERIOD
 441 #ifdef RE_ENABLE_I18N
 442                || type == OP_UTF8_PERIOD
 443 #endif /* RE_ENABLE_I18N */
 444                || type == END_OF_RE)
 445         {
 446           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 447           if (type == END_OF_RE)
 448             bufp->can_be_null = 1;
 449           return;
 450         }
 451     }
 452 }
 453 \f
 454 /* Entry point for POSIX code.  */
 455 /* regcomp takes a regular expression as a string and compiles it.
 456
 457    PREG is a regex_t *.  We do not expect any fields to be initialized,
 458    since POSIX says we shouldn't.  Thus, we set
 459
 460      `buffer' to the compiled pattern;
 461      `used' to the length of the compiled pattern;
 462      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 463        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 464        RE_SYNTAX_POSIX_BASIC;
 465      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 466      `fastmap' to an allocated space for the fastmap;
 467      `fastmap_accurate' to zero;
 468      `re_nsub' to the number of subexpressions in PATTERN.
 469
 470    PATTERN is the address of the pattern string.
 471
 472    CFLAGS is a series of bits which affect compilation.
 473
 474      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 475      use POSIX basic syntax.
 476
 477      If REG_NEWLINE is set, then . and [^...] don't match newline.
 478      Also, regexec will try a match beginning after every newline.
 479
 480      If REG_ICASE is set, then we considers upper- and lowercase
 481      versions of letters to be equivalent when matching.
 482
 483      If REG_NOSUB is set, then when PREG is passed to regexec, that
 484      routine will report only success or failure, and nothing about the
 485      registers.
 486
 487    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 488    the return codes and their meanings.)  */
 489
 490 int
 491 regcomp (regex_t *__restrict preg,
 492          const char *__restrict pattern,
 493          int cflags)
 494 {
 495   reg_errcode_t ret;
 496   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 497                          : RE_SYNTAX_POSIX_BASIC);
 498
 499   preg->buffer = NULL;
 500   preg->allocated = 0;
 501   preg->used = 0;
 502
 503   /* Try to allocate space for the fastmap.  */
 504   preg->fastmap = re_malloc (char, SBC_MAX);
 505   if (BE (preg->fastmap == NULL, 0))
 506     return REG_ESPACE;
 507
 508   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 509
 510   /* If REG_NEWLINE is set, newlines are treated differently.  */
 511   if (cflags & REG_NEWLINE)
 512     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 513       syntax &= ~RE_DOT_NEWLINE;
 514       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 515       /* It also changes the matching behavior.  */
 516       preg->newline_anchor = 1;
 517     }
 518   else
 519     preg->newline_anchor = 0;
 520   preg->no_sub = !!(cflags & REG_NOSUB);
 521   preg->translate = NULL;
 522
 523   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 524
 525   /* POSIX doesn't distinguish between an unmatched open-group and an
 526      unmatched close-group: both are REG_EPAREN.  */
 527   if (ret == REG_ERPAREN)
 528     ret = REG_EPAREN;
 529
 530   /* We have already checked preg->fastmap != NULL.  */
 531   if (BE (ret == REG_NOERROR, 1))
 532     /* Compute the fastmap now, since regexec cannot modify the pattern
 533        buffer.  This function never fails in this implementation.  */
 534     (void) re_compile_fastmap (preg);
 535   else
 536     {
 537       /* Some error occurred while compiling the expression.  */
 538       re_free (preg->fastmap);
 539       preg->fastmap = NULL;
 540     }
 541
 542   return (int) ret;
 543 }
 544 #ifdef _LIBC
 545 weak_alias (__regcomp, regcomp)
 546 #endif
 547
 548 /* Returns a message corresponding to an error code, ERRCODE, returned
 549    from either regcomp or regexec.   We don't use PREG here.  */
 550
 551 size_t
 552 regerror(int errcode, const regex_t *__restrict preg,
 553          char *__restrict errbuf, size_t errbuf_size)
 554 {
 555   const char *msg;
 556   size_t msg_size;
 557
 558   if (BE (errcode < 0
 559           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 560                                / sizeof (__re_error_msgid_idx[0])), 0))
 561     /* Only error codes returned by the rest of the code should be passed
 562        to this routine.  If we are given anything else, or if other regex
 563        code generates an invalid error code, then the program has a bug.
 564        Dump core so we can fix it.  */
 565     abort ();
 566
 567   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 568
 569   msg_size = strlen (msg) + 1; /* Includes the null.  */
 570
 571   if (BE (errbuf_size != 0, 1))
 572     {
 573       if (BE (msg_size > errbuf_size, 0))
 574         {
 575           memcpy (errbuf, msg, errbuf_size - 1);
 576           errbuf[errbuf_size - 1] = 0;
 577         }
 578       else
 579         memcpy (errbuf, msg, msg_size);
 580     }
 581
 582   return msg_size;
 583 }
 584 #ifdef _LIBC
 585 weak_alias (__regerror, regerror)
 586 #endif
 587
 588
 589 #ifdef RE_ENABLE_I18N
 590 /* This static array is used for the map to single-byte characters when
 591    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 592    it the same all the time.  UTF-8 is the preferred encoding so this is
 593    a worthwhile optimization.  */
 594 #if __GNUC__ >= 3
 595 static const bitset_t utf8_sb_map = {
 596   /* Set the first 128 bits.  */
 597   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 598 };
 599 #else /* ! (__GNUC__ >= 3) */
 600 static bitset_t utf8_sb_map;
 601 #endif /* __GNUC__ >= 3 */
 602 #endif /* RE_ENABLE_I18N */
 603
 604
 605 static void
 606 free_dfa_content (re_dfa_t *dfa)
 607 {
 608   int i, j;
 609
 610   if (dfa->nodes)
 611     for (i = 0; i < dfa->nodes_len; ++i)
 612       free_token (dfa->nodes + i);
 613   re_free (dfa->nexts);
 614   for (i = 0; i < dfa->nodes_len; ++i)
 615     {
 616       if (dfa->eclosures != NULL)
 617         re_node_set_free (dfa->eclosures + i);
 618       if (dfa->inveclosures != NULL)
 619         re_node_set_free (dfa->inveclosures + i);
 620       if (dfa->edests != NULL)
 621         re_node_set_free (dfa->edests + i);
 622     }
 623   re_free (dfa->edests);
 624   re_free (dfa->eclosures);
 625   re_free (dfa->inveclosures);
 626   re_free (dfa->nodes);
 627
 628   if (dfa->state_table)
 629     for (i = 0; i <= dfa->state_hash_mask; ++i)
 630       {
 631         struct re_state_table_entry *entry = dfa->state_table + i;
 632         for (j = 0; j < entry->num; ++j)
 633           {
 634             re_dfastate_t *state = entry->array[j];
 635             free_state (state);
 636           }
 637         re_free (entry->array);
 638       }
 639   re_free (dfa->state_table);
 640 #ifdef RE_ENABLE_I18N
 641   if (dfa->sb_char != utf8_sb_map)
 642     re_free (dfa->sb_char);
 643 #endif
 644   re_free (dfa->subexp_map);
 645 #ifdef DEBUG
 646   re_free (dfa->re_str);
 647 #endif
 648
 649   re_free (dfa);
 650 }
 651
 652
 653 /* Free dynamically allocated space used by PREG.  */
 654
 655 void
 656 regfree (regex_t *preg)
 657 {
 658   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 659   if (BE (dfa != NULL, 1))
 660     free_dfa_content (dfa);
 661   preg->buffer = NULL;
 662   preg->allocated = 0;
 663
 664   re_free (preg->fastmap);
 665   preg->fastmap = NULL;
 666
 667   re_free (preg->translate);
 668   preg->translate = NULL;
 669 }
 670 #ifdef _LIBC
 671 weak_alias (__regfree, regfree)
 672 #endif
 673 \f
 674 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 675    them unless specifically requested.  */
 676
 677 #if defined _REGEX_RE_COMP || defined _LIBC
 678
 679 /* BSD has one and only one pattern buffer.  */
 680 static struct re_pattern_buffer re_comp_buf;
 681
 682 char *
 683 # ifdef _LIBC
 684 /* Make these definitions weak in libc, so POSIX programs can redefine
 685    these names if they don't use our functions, and still use
 686    regcomp/regexec above without link errors.  */
 687 weak_function
 688 # endif
 689 re_comp (s)
 690      const char *s;
 691 {
 692   reg_errcode_t ret;
 693   char *fastmap;
 694
 695   if (!s)
 696     {
 697       if (!re_comp_buf.buffer)
 698         return gettext ("No previous regular expression");
 699       return 0;
 700     }
 701
 702   if (re_comp_buf.buffer)
 703     {
 704       fastmap = re_comp_buf.fastmap;
 705       re_comp_buf.fastmap = NULL;
 706       __regfree (&re_comp_buf);
 707       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 708       re_comp_buf.fastmap = fastmap;
 709     }
 710
 711   if (re_comp_buf.fastmap == NULL)
 712     {
 713       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 714       if (re_comp_buf.fastmap == NULL)
 715         return (char *) gettext (__re_error_msgid
 716                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 717     }
 718
 719   /* Since `re_exec' always passes NULL for the `regs' argument, we
 720      don't need to initialize the pattern buffer fields which affect it.  */
 721
 722   /* Match anchors at newlines.  */
 723   re_comp_buf.newline_anchor = 1;
 724
 725   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 726
 727   if (!ret)
 728     return NULL;
 729
 730   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 731   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 732 }
 733
 734 #ifdef _LIBC
 735 libc_freeres_fn (free_mem)
 736 {
 737   __regfree (&re_comp_buf);
 738 }
 739 #endif
 740
 741 #endif /* _REGEX_RE_COMP */
 742 \f
 743 /* Internal entry point.
 744    Compile the regular expression PATTERN, whose length is LENGTH.
 745    SYNTAX indicate regular expression's syntax.  */
 746
 747 static reg_errcode_t
 748 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 749                      reg_syntax_t syntax)
 750 {
 751   reg_errcode_t err = REG_NOERROR;
 752   re_dfa_t *dfa;
 753   re_string_t regexp;
 754
 755   /* Initialize the pattern buffer.  */
 756   preg->fastmap_accurate = 0;
 757   preg->syntax = syntax;
 758   preg->not_bol = preg->not_eol = 0;
 759   preg->used = 0;
 760   preg->re_nsub = 0;
 761   preg->can_be_null = 0;
 762   preg->regs_allocated = REGS_UNALLOCATED;
 763
 764   /* Initialize the dfa.  */
 765   dfa = (re_dfa_t *) preg->buffer;
 766   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 767     {
 768       /* If zero allocated, but buffer is non-null, try to realloc
 769          enough space.  This loses if buffer's address is bogus, but
 770          that is the user's responsibility.  If ->buffer is NULL this
 771          is a simple allocation.  */
 772       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 773       if (dfa == NULL)
 774         return REG_ESPACE;
 775       preg->allocated = sizeof (re_dfa_t);
 776       preg->buffer = (unsigned char *) dfa;
 777     }
 778   preg->used = sizeof (re_dfa_t);
 779
 780   err = init_dfa (dfa, length);
 781   if (BE (err != REG_NOERROR, 0))
 782     {
 783       free_dfa_content (dfa);
 784       preg->buffer = NULL;
 785       preg->allocated = 0;
 786       return err;
 787     }
 788 #ifdef DEBUG
 789   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 790   dfa->re_str = re_malloc (char, length + 1);
 791   strncpy (dfa->re_str, pattern, length + 1);
 792 #endif
 793
 794   __libc_lock_init (dfa->lock);
 795
 796   err = re_string_construct (&regexp, pattern, length, preg->translate,
 797                              syntax & RE_ICASE, dfa);
 798   if (BE (err != REG_NOERROR, 0))
 799     {
 800     re_compile_internal_free_return:
 801       free_workarea_compile (preg);
 802       re_string_destruct (&regexp);
 803       free_dfa_content (dfa);
 804       preg->buffer = NULL;
 805       preg->allocated = 0;
 806       return err;
 807     }
 808
 809   /* Parse the regular expression, and build a structure tree.  */
 810   preg->re_nsub = 0;
 811   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 812   if (BE (dfa->str_tree == NULL, 0))
 813     goto re_compile_internal_free_return;
 814
 815   /* Analyze the tree and create the nfa.  */
 816   err = analyze (preg);
 817   if (BE (err != REG_NOERROR, 0))
 818     goto re_compile_internal_free_return;
 819
 820 #ifdef RE_ENABLE_I18N
 821   /* If possible, do searching in single byte encoding to speed things up.  */
 822   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 823     optimize_utf8 (dfa);
 824 #endif
 825
 826   /* Then create the initial state of the dfa.  */
 827   err = create_initial_state (dfa);
 828
 829   /* Release work areas.  */
 830   free_workarea_compile (preg);
 831   re_string_destruct (&regexp);
 832
 833   if (BE (err != REG_NOERROR, 0))
 834     {
 835       free_dfa_content (dfa);
 836       preg->buffer = NULL;
 837       preg->allocated = 0;
 838     }
 839
 840   return err;
 841 }
 842
 843 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 844    as the initial length of some arrays.  */
 845
 846 static reg_errcode_t
 847 init_dfa (re_dfa_t *dfa, size_t pat_len)
 848 {
 849   unsigned int table_size;
 850 #ifndef _LIBC
 851   char *codeset_name;
 852 #endif
 853
 854   memset (dfa, '\0', sizeof (re_dfa_t));
 855
 856   /* Force allocation of str_tree_storage the first time.  */
 857   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 858
 859   /* Avoid overflows.  */
 860   if (pat_len == SIZE_MAX)
 861     return REG_ESPACE;
 862
 863   dfa->nodes_alloc = pat_len + 1;
 864   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 865
 866   /*  table_size = 2 ^ ceil(log pat_len) */
 867   for (table_size = 1; ; table_size <<= 1)
 868     if (table_size > pat_len)
 869       break;
 870
 871   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 872   dfa->state_hash_mask = table_size - 1;
 873
 874   dfa->mb_cur_max = MB_CUR_MAX;
 875 #ifdef _LIBC
 876   if (dfa->mb_cur_max == 6
 877       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 878     dfa->is_utf8 = 1;
 879   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 880                        != 0);
 881 #else
 882 # ifdef HAVE_LANGINFO_CODESET
 883   codeset_name = nl_langinfo (CODESET);
 884 # else
 885   codeset_name = getenv ("LC_ALL");
 886   if (codeset_name == NULL || codeset_name[0] == '\0')
 887     codeset_name = getenv ("LC_CTYPE");
 888   if (codeset_name == NULL || codeset_name[0] == '\0')
 889     codeset_name = getenv ("LANG");
 890   if (codeset_name == NULL)
 891     codeset_name = "";
 892   else if (strchr (codeset_name, '.') !=  NULL)
 893     codeset_name = strchr (codeset_name, '.') + 1;
 894 # endif
 895
 896   /* strcasecmp isn't a standard interface. brute force check */
 897 #if 0
 898   if (strcasecmp (codeset_name, "UTF-8") == 0
 899       || strcasecmp (codeset_name, "UTF8") == 0)
 900     dfa->is_utf8 = 1;
 901 #else
 902   if (   (codeset_name[0] == 'U' || codeset_name[0] == 'u')
 903       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 904       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 905       && (codeset_name[3] == '-'
 906           ? codeset_name[4] == '8' && codeset_name[5] == '\0'
 907           : codeset_name[3] == '8' && codeset_name[4] == '\0'))
 908     dfa->is_utf8 = 1;
 909 #endif
 910
 911   /* We check exhaustively in the loop below if this charset is a
 912      superset of ASCII.  */
 913   dfa->map_notascii = 0;
 914 #endif
 915
 916 #ifdef RE_ENABLE_I18N
 917   if (dfa->mb_cur_max > 1)
 918     {
 919       if (dfa->is_utf8)
 920         {
 921 #if !defined(__GNUC__) || __GNUC__ < 3
 922           static short utf8_sb_map_inited = 0;
 923
 924           if (! utf8_sb_map_inited)
 925             {
 926                 int i;
 927
 928                 utf8_sb_map_inited = 0;
 929                 for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
 930                   utf8_sb_map[i] = BITSET_WORD_MAX;
 931             }
 932 #endif
 933           dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 934         }
 935       else
 936         {
 937           int i, j, ch;
 938
 939           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 940           if (BE (dfa->sb_char == NULL, 0))
 941             return REG_ESPACE;
 942
 943           /* Set the bits corresponding to single byte chars.  */
 944           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 945             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 946               {
 947                 wint_t wch = __btowc (ch);
 948                 if (wch != WEOF)
 949                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 950 # ifndef _LIBC
 951                 if (isascii (ch) && wch != ch)
 952                   dfa->map_notascii = 1;
 953 # endif
 954               }
 955         }
 956     }
 957 #endif
 958
 959   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 960     return REG_ESPACE;
 961   return REG_NOERROR;
 962 }
 963
 964 /* Initialize WORD_CHAR table, which indicate which character is
 965    "word".  In this case "word" means that it is the word construction
 966    character used by some operators like "\<", "\>", etc.  */
 967
 968 static void
 969 internal_function
 970 init_word_char (re_dfa_t *dfa)
 971 {
 972   int i, j, ch;
 973   dfa->word_ops_used = 1;
 974   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 975     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 976       if (isalnum (ch) || ch == '_')
 977         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 978 }
 979
 980 /* Free the work area which are only used while compiling.  */
 981
 982 static void
 983 free_workarea_compile (regex_t *preg)
 984 {
 985   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 986   bin_tree_storage_t *storage, *next;
 987   for (storage = dfa->str_tree_storage; storage; storage = next)
 988     {
 989       next = storage->next;
 990       re_free (storage);
 991     }
 992   dfa->str_tree_storage = NULL;
 993   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 994   dfa->str_tree = NULL;
 995   re_free (dfa->org_indices);
 996   dfa->org_indices = NULL;
 997 }
 998
 999 /* Create initial states for all contexts.  */
1000
1001 static reg_errcode_t
1002 create_initial_state (re_dfa_t *dfa)
1003 {
1004   int first, i;
1005   reg_errcode_t err;
1006   re_node_set init_nodes;
1007
1008   /* Initial states have the epsilon closure of the node which is
1009      the first node of the regular expression.  */
1010   first = dfa->str_tree->first->node_idx;
1011   dfa->init_node = first;
1012   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1013   if (BE (err != REG_NOERROR, 0))
1014     return err;
1015
1016   /* The back-references which are in initial states can epsilon transit,
1017      since in this case all of the subexpressions can be null.
1018      Then we add epsilon closures of the nodes which are the next nodes of
1019      the back-references.  */
1020   if (dfa->nbackref > 0)
1021     for (i = 0; i < init_nodes.nelem; ++i)
1022       {
1023         int node_idx = init_nodes.elems[i];
1024         re_token_type_t type = dfa->nodes[node_idx].type;
1025
1026         int clexp_idx;
1027         if (type != OP_BACK_REF)
1028           continue;
1029         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1030           {
1031             re_token_t *clexp_node;
1032             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1033             if (clexp_node->type == OP_CLOSE_SUBEXP
1034                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1035               break;
1036           }
1037         if (clexp_idx == init_nodes.nelem)
1038           continue;
1039
1040         if (type == OP_BACK_REF)
1041           {
1042             int dest_idx = dfa->edests[node_idx].elems[0];
1043             if (!re_node_set_contains (&init_nodes, dest_idx))
1044               {
1045                 reg_errcode_t err = re_node_set_merge (&init_nodes,
1046                                                        dfa->eclosures
1047                                                        + dest_idx);
1048                 if (err != REG_NOERROR)
1049                   return err;
1050                 i = 0;
1051               }
1052           }
1053       }
1054
1055   /* It must be the first time to invoke acquire_state.  */
1056   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1057   /* We don't check ERR here, since the initial state must not be NULL.  */
1058   if (BE (dfa->init_state == NULL, 0))
1059     return err;
1060   if (dfa->init_state->has_constraint)
1061     {
1062       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1063                                                        CONTEXT_WORD);
1064       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1065                                                      CONTEXT_NEWLINE);
1066       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1067                                                          &init_nodes,
1068                                                          CONTEXT_NEWLINE
1069                                                          | CONTEXT_BEGBUF);
1070       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1071               || dfa->init_state_begbuf == NULL, 0))
1072         return err;
1073     }
1074   else
1075     dfa->init_state_word = dfa->init_state_nl
1076       = dfa->init_state_begbuf = dfa->init_state;
1077
1078   re_node_set_free (&init_nodes);
1079   return REG_NOERROR;
1080 }
1081 \f
1082 #ifdef RE_ENABLE_I18N
1083 /* If it is possible to do searching in single byte encoding instead of UTF-8
1084    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1085    DFA nodes where needed.  */
1086
1087 static void
1088 optimize_utf8 (re_dfa_t *dfa)
1089 {
1090   int node, i, mb_chars = 0, has_period = 0;
1091
1092   for (node = 0; node < dfa->nodes_len; ++node)
1093     switch (dfa->nodes[node].type)
1094       {
1095       case CHARACTER:
1096         if (dfa->nodes[node].opr.c >= 0x80)
1097           mb_chars = 1;
1098         break;
1099       case ANCHOR:
1100         switch (dfa->nodes[node].opr.ctx_type)
1101           {
1102           case LINE_FIRST:
1103           case LINE_LAST:
1104           case BUF_FIRST:
1105           case BUF_LAST:
1106             break;
1107           default:
1108             /* Word anchors etc. cannot be handled.  It's okay to test
1109                opr.ctx_type since constraints (for all DFA nodes) are
1110                created by ORing one or more opr.ctx_type values.  */
1111             return;
1112           }
1113         break;
1114       case OP_PERIOD:
1115         has_period = 1;
1116         break;
1117       case OP_BACK_REF:
1118       case OP_ALT:
1119       case END_OF_RE:
1120       case OP_DUP_ASTERISK:
1121       case OP_OPEN_SUBEXP:
1122       case OP_CLOSE_SUBEXP:
1123         break;
1124       case COMPLEX_BRACKET:
1125         return;
1126       case SIMPLE_BRACKET:
1127         /* Just double check.  The non-ASCII range starts at 0x80.  */
1128         assert (0x80 % BITSET_WORD_BITS == 0);
1129         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1130           if (dfa->nodes[node].opr.sbcset[i])
1131             return;
1132         break;
1133       default:
1134         abort ();
1135       }
1136
1137   if (mb_chars || has_period)
1138     for (node = 0; node < dfa->nodes_len; ++node)
1139       {
1140         if (dfa->nodes[node].type == CHARACTER
1141             && dfa->nodes[node].opr.c >= 0x80)
1142           dfa->nodes[node].mb_partial = 0;
1143         else if (dfa->nodes[node].type == OP_PERIOD)
1144           dfa->nodes[node].type = OP_UTF8_PERIOD;
1145       }
1146
1147   /* The search can be in single byte locale.  */
1148   dfa->mb_cur_max = 1;
1149   dfa->is_utf8 = 0;
1150   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1151 }
1152 #endif
1153 \f
1154 /* Analyze the structure tree, and calculate "first", "next", "edest",
1155    "eclosure", and "inveclosure".  */
1156
1157 static reg_errcode_t
1158 analyze (regex_t *preg)
1159 {
1160   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1161   reg_errcode_t ret;
1162
1163   /* Allocate arrays.  */
1164   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1165   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1166   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1167   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1168   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1169           || dfa->eclosures == NULL, 0))
1170     return REG_ESPACE;
1171
1172   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1173   if (dfa->subexp_map != NULL)
1174     {
1175       int i;
1176       for (i = 0; i < preg->re_nsub; i++)
1177         dfa->subexp_map[i] = i;
1178       preorder (dfa->str_tree, optimize_subexps, dfa);
1179       for (i = 0; i < preg->re_nsub; i++)
1180         if (dfa->subexp_map[i] != i)
1181           break;
1182       if (i == preg->re_nsub)
1183         {
1184           free (dfa->subexp_map);
1185           dfa->subexp_map = NULL;
1186         }
1187     }
1188
1189   ret = postorder (dfa->str_tree, lower_subexps, preg);
1190   if (BE (ret != REG_NOERROR, 0))
1191     return ret;
1192   ret = postorder (dfa->str_tree, calc_first, dfa);
1193   if (BE (ret != REG_NOERROR, 0))
1194     return ret;
1195   preorder (dfa->str_tree, calc_next, dfa);
1196   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1197   if (BE (ret != REG_NOERROR, 0))
1198     return ret;
1199   ret = calc_eclosure (dfa);
1200   if (BE (ret != REG_NOERROR, 0))
1201     return ret;
1202
1203   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1204      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1205   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1206       || dfa->nbackref)
1207     {
1208       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1209       if (BE (dfa->inveclosures == NULL, 0))
1210         return REG_ESPACE;
1211       ret = calc_inveclosure (dfa);
1212     }
1213
1214   return ret;
1215 }
1216
1217 /* Our parse trees are very unbalanced, so we cannot use a stack to
1218    implement parse tree visits.  Instead, we use parent pointers and
1219    some hairy code in these two functions.  */
1220 static reg_errcode_t
1221 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1222            void *extra)
1223 {
1224   bin_tree_t *node, *prev;
1225
1226   for (node = root; ; )
1227     {
1228       /* Descend down the tree, preferably to the left (or to the right
1229          if that's the only child).  */
1230       while (node->left || node->right)
1231         if (node->left)
1232           node = node->left;
1233         else
1234           node = node->right;
1235
1236       do
1237         {
1238           reg_errcode_t err = fn (extra, node);
1239           if (BE (err != REG_NOERROR, 0))
1240             return err;
1241           if (node->parent == NULL)
1242             return REG_NOERROR;
1243           prev = node;
1244           node = node->parent;
1245         }
1246       /* Go up while we have a node that is reached from the right.  */
1247       while (node->right == prev || node->right == NULL);
1248       node = node->right;
1249     }
1250 }
1251
1252 static reg_errcode_t
1253 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1254           void *extra)
1255 {
1256   bin_tree_t *node;
1257
1258   for (node = root; ; )
1259     {
1260       reg_errcode_t err = fn (extra, node);
1261       if (BE (err != REG_NOERROR, 0))
1262         return err;
1263
1264       /* Go to the left node, or up and to the right.  */
1265       if (node->left)
1266         node = node->left;
1267       else
1268         {
1269           bin_tree_t *prev = NULL;
1270           while (node->right == prev || node->right == NULL)
1271             {
1272               prev = node;
1273               node = node->parent;
1274               if (!node)
1275                 return REG_NOERROR;
1276             }
1277           node = node->right;
1278         }
1279     }
1280 }
1281
1282 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1283    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1284    backreferences as well.  Requires a preorder visit.  */
1285 static reg_errcode_t
1286 optimize_subexps (void *extra, bin_tree_t *node)
1287 {
1288   re_dfa_t *dfa = (re_dfa_t *) extra;
1289
1290   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1291     {
1292       int idx = node->token.opr.idx;
1293       node->token.opr.idx = dfa->subexp_map[idx];
1294       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1295     }
1296
1297   else if (node->token.type == SUBEXP
1298            && node->left && node->left->token.type == SUBEXP)
1299     {
1300       int other_idx = node->left->token.opr.idx;
1301
1302       node->left = node->left->left;
1303       if (node->left)
1304         node->left->parent = node;
1305
1306       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1307       if (other_idx < BITSET_WORD_BITS)
1308           dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1309     }
1310
1311   return REG_NOERROR;
1312 }
1313
1314 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1315    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1316 static reg_errcode_t
1317 lower_subexps (void *extra, bin_tree_t *node)
1318 {
1319   regex_t *preg = (regex_t *) extra;
1320   reg_errcode_t err = REG_NOERROR;
1321
1322   if (node->left && node->left->token.type == SUBEXP)
1323     {
1324       node->left = lower_subexp (&err, preg, node->left);
1325       if (node->left)
1326         node->left->parent = node;
1327     }
1328   if (node->right && node->right->token.type == SUBEXP)
1329     {
1330       node->right = lower_subexp (&err, preg, node->right);
1331       if (node->right)
1332         node->right->parent = node;
1333     }
1334
1335   return err;
1336 }
1337
1338 static bin_tree_t *
1339 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1340 {
1341   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1342   bin_tree_t *body = node->left;
1343   bin_tree_t *op, *cls, *tree1, *tree;
1344
1345   if (preg->no_sub
1346       /* We do not optimize empty subexpressions, because otherwise we may
1347          have bad CONCAT nodes with NULL children.  This is obviously not
1348          very common, so we do not lose much.  An example that triggers
1349          this case is the sed "script" /\(\)/x.  */
1350       && node->left != NULL
1351       && (node->token.opr.idx >= BITSET_WORD_BITS
1352           || !(dfa->used_bkref_map
1353                & ((bitset_word_t) 1 << node->token.opr.idx))))
1354     return node->left;
1355
1356   /* Convert the SUBEXP node to the concatenation of an
1357      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1358   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1359   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1360   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1361   tree = create_tree (dfa, op, tree1, CONCAT);
1362   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1363     {
1364       *err = REG_ESPACE;
1365       return NULL;
1366     }
1367
1368   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1369   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1370   return tree;
1371 }
1372
1373 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1374    nodes.  Requires a postorder visit.  */
1375 static reg_errcode_t
1376 calc_first (void *extra, bin_tree_t *node)
1377 {
1378   re_dfa_t *dfa = (re_dfa_t *) extra;
1379   if (node->token.type == CONCAT)
1380     {
1381       node->first = node->left->first;
1382       node->node_idx = node->left->node_idx;
1383     }
1384   else
1385     {
1386       node->first = node;
1387       node->node_idx = re_dfa_add_node (dfa, node->token);
1388       if (BE (node->node_idx == -1, 0))
1389         return REG_ESPACE;
1390       if (node->token.type == ANCHOR)
1391         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1392     }
1393   return REG_NOERROR;
1394 }
1395
1396 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1397 static reg_errcode_t
1398 calc_next (void *extra, bin_tree_t *node)
1399 {
1400   switch (node->token.type)
1401     {
1402     case OP_DUP_ASTERISK:
1403       node->left->next = node;
1404       break;
1405     case CONCAT:
1406       node->left->next = node->right->first;
1407       node->right->next = node->next;
1408       break;
1409     default:
1410       if (node->left)
1411         node->left->next = node->next;
1412       if (node->right)
1413         node->right->next = node->next;
1414       break;
1415     }
1416   return REG_NOERROR;
1417 }
1418
1419 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1420 static reg_errcode_t
1421 link_nfa_nodes (void *extra, bin_tree_t *node)
1422 {
1423   re_dfa_t *dfa = (re_dfa_t *) extra;
1424   int idx = node->node_idx;
1425   reg_errcode_t err = REG_NOERROR;
1426
1427   switch (node->token.type)
1428     {
1429     case CONCAT:
1430       break;
1431
1432     case END_OF_RE:
1433       assert (node->next == NULL);
1434       break;
1435
1436     case OP_DUP_ASTERISK:
1437     case OP_ALT:
1438       {
1439         int left, right;
1440         dfa->has_plural_match = 1;
1441         if (node->left != NULL)
1442           left = node->left->first->node_idx;
1443         else
1444           left = node->next->node_idx;
1445         if (node->right != NULL)
1446           right = node->right->first->node_idx;
1447         else
1448           right = node->next->node_idx;
1449         assert (left > -1);
1450         assert (right > -1);
1451         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1452       }
1453       break;
1454
1455     case ANCHOR:
1456     case OP_OPEN_SUBEXP:
1457     case OP_CLOSE_SUBEXP:
1458       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1459       break;
1460
1461     case OP_BACK_REF:
1462       dfa->nexts[idx] = node->next->node_idx;
1463       if (node->token.type == OP_BACK_REF)
1464         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1465       break;
1466
1467     default:
1468       assert (!IS_EPSILON_NODE (node->token.type));
1469       dfa->nexts[idx] = node->next->node_idx;
1470       break;
1471     }
1472
1473   return err;
1474 }
1475
1476 /* Duplicate the epsilon closure of the node ROOT_NODE.
1477    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1478    to their own constraint.  */
1479
1480 static reg_errcode_t
1481 internal_function
1482 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1483                         int root_node, unsigned int init_constraint)
1484 {
1485   int org_node, clone_node, ret;
1486   unsigned int constraint = init_constraint;
1487   for (org_node = top_org_node, clone_node = top_clone_node;;)
1488     {
1489       int org_dest, clone_dest;
1490       if (dfa->nodes[org_node].type == OP_BACK_REF)
1491         {
1492           /* If the back reference epsilon-transit, its destination must
1493              also have the constraint.  Then duplicate the epsilon closure
1494              of the destination of the back reference, and store it in
1495              edests of the back reference.  */
1496           org_dest = dfa->nexts[org_node];
1497           re_node_set_empty (dfa->edests + clone_node);
1498           clone_dest = duplicate_node (dfa, org_dest, constraint);
1499           if (BE (clone_dest == -1, 0))
1500             return REG_ESPACE;
1501           dfa->nexts[clone_node] = dfa->nexts[org_node];
1502           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1503           if (BE (ret < 0, 0))
1504             return REG_ESPACE;
1505         }
1506       else if (dfa->edests[org_node].nelem == 0)
1507         {
1508           /* In case of the node can't epsilon-transit, don't duplicate the
1509              destination and store the original destination as the
1510              destination of the node.  */
1511           dfa->nexts[clone_node] = dfa->nexts[org_node];
1512           break;
1513         }
1514       else if (dfa->edests[org_node].nelem == 1)
1515         {
1516           /* In case of the node can epsilon-transit, and it has only one
1517              destination.  */
1518           org_dest = dfa->edests[org_node].elems[0];
1519           re_node_set_empty (dfa->edests + clone_node);
1520           /* If the node is root_node itself, it means the epsilon clsoure
1521              has a loop.   Then tie it to the destination of the root_node.  */
1522           if (org_node == root_node && clone_node != org_node)
1523             {
1524               ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1525               if (BE (ret < 0, 0))
1526                 return REG_ESPACE;
1527               break;
1528             }
1529           /* In case of the node has another constraint, add it.  */
1530           constraint |= dfa->nodes[org_node].constraint;
1531           clone_dest = duplicate_node (dfa, org_dest, constraint);
1532           if (BE (clone_dest == -1, 0))
1533             return REG_ESPACE;
1534           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1535           if (BE (ret < 0, 0))
1536             return REG_ESPACE;
1537         }
1538       else /* dfa->edests[org_node].nelem == 2 */
1539         {
1540           /* In case of the node can epsilon-transit, and it has two
1541              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1542           org_dest = dfa->edests[org_node].elems[0];
1543           re_node_set_empty (dfa->edests + clone_node);
1544           /* Search for a duplicated node which satisfies the constraint.  */
1545           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1546           if (clone_dest == -1)
1547             {
1548               /* There is no such duplicated node, create a new one.  */
1549               reg_errcode_t err;
1550               clone_dest = duplicate_node (dfa, org_dest, constraint);
1551               if (BE (clone_dest == -1, 0))
1552                 return REG_ESPACE;
1553               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1554               if (BE (ret < 0, 0))
1555                 return REG_ESPACE;
1556               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1557                                             root_node, constraint);
1558               if (BE (err != REG_NOERROR, 0))
1559                 return err;
1560             }
1561           else
1562             {
1563               /* There is a duplicated node which satisfies the constraint,
1564                  use it to avoid infinite loop.  */
1565               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1566               if (BE (ret < 0, 0))
1567                 return REG_ESPACE;
1568             }
1569
1570           org_dest = dfa->edests[org_node].elems[1];
1571           clone_dest = duplicate_node (dfa, org_dest, constraint);
1572           if (BE (clone_dest == -1, 0))
1573             return REG_ESPACE;
1574           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1575           if (BE (ret < 0, 0))
1576             return REG_ESPACE;
1577         }
1578       org_node = org_dest;
1579       clone_node = clone_dest;
1580     }
1581   return REG_NOERROR;
1582 }
1583
1584 /* Search for a node which is duplicated from the node ORG_NODE, and
1585    satisfies the constraint CONSTRAINT.  */
1586
1587 static int
1588 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1589                         unsigned int constraint)
1590 {
1591   int idx;
1592   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1593     {
1594       if (org_node == dfa->org_indices[idx]
1595           && constraint == dfa->nodes[idx].constraint)
1596         return idx; /* Found.  */
1597     }
1598   return -1; /* Not found.  */
1599 }
1600
1601 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1602    Return the index of the new node, or -1 if insufficient storage is
1603    available.  */
1604
1605 static int
1606 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1607 {
1608   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1609   if (BE (dup_idx != -1, 1))
1610     {
1611       dfa->nodes[dup_idx].constraint = constraint;
1612       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1613       dfa->nodes[dup_idx].duplicated = 1;
1614
1615       /* Store the index of the original node.  */
1616       dfa->org_indices[dup_idx] = org_idx;
1617     }
1618   return dup_idx;
1619 }
1620
1621 static reg_errcode_t
1622 calc_inveclosure (re_dfa_t *dfa)
1623 {
1624   int src, idx, ret;
1625   for (idx = 0; idx < dfa->nodes_len; ++idx)
1626     re_node_set_init_empty (dfa->inveclosures + idx);
1627
1628   for (src = 0; src < dfa->nodes_len; ++src)
1629     {
1630       int *elems = dfa->eclosures[src].elems;
1631       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1632         {
1633           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1634           if (BE (ret == -1, 0))
1635             return REG_ESPACE;
1636         }
1637     }
1638
1639   return REG_NOERROR;
1640 }
1641
1642 /* Calculate "eclosure" for all the node in DFA.  */
1643
1644 static reg_errcode_t
1645 calc_eclosure (re_dfa_t *dfa)
1646 {
1647   int node_idx, incomplete;
1648 #ifdef DEBUG
1649   assert (dfa->nodes_len > 0);
1650 #endif
1651   incomplete = 0;
1652   /* For each nodes, calculate epsilon closure.  */
1653   for (node_idx = 0; ; ++node_idx)
1654     {
1655       reg_errcode_t err;
1656       re_node_set eclosure_elem;
1657       if (node_idx == dfa->nodes_len)
1658         {
1659           if (!incomplete)
1660             break;
1661           incomplete = 0;
1662           node_idx = 0;
1663         }
1664
1665 #ifdef DEBUG
1666       assert (dfa->eclosures[node_idx].nelem != -1);
1667 #endif
1668
1669       /* If we have already calculated, skip it.  */
1670       if (dfa->eclosures[node_idx].nelem != 0)
1671         continue;
1672       /* Calculate epsilon closure of `node_idx'.  */
1673       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1674       if (BE (err != REG_NOERROR, 0))
1675         return err;
1676
1677       if (dfa->eclosures[node_idx].nelem == 0)
1678         {
1679           incomplete = 1;
1680           re_node_set_free (&eclosure_elem);
1681         }
1682     }
1683   return REG_NOERROR;
1684 }
1685
1686 /* Calculate epsilon closure of NODE.  */
1687
1688 static reg_errcode_t
1689 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1690 {
1691   reg_errcode_t err;
1692   int i;
1693   re_node_set eclosure;
1694   int ret;
1695   int incomplete = 0;
1696   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1697   if (BE (err != REG_NOERROR, 0))
1698     return err;
1699
1700   /* This indicates that we are calculating this node now.
1701      We reference this value to avoid infinite loop.  */
1702   dfa->eclosures[node].nelem = -1;
1703
1704   /* If the current node has constraints, duplicate all nodes
1705      since they must inherit the constraints.  */
1706   if (dfa->nodes[node].constraint
1707       && dfa->edests[node].nelem
1708       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1709     {
1710       err = duplicate_node_closure (dfa, node, node, node,
1711                                     dfa->nodes[node].constraint);
1712       if (BE (err != REG_NOERROR, 0))
1713         return err;
1714     }
1715
1716   /* Expand each epsilon destination nodes.  */
1717   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1718     for (i = 0; i < dfa->edests[node].nelem; ++i)
1719       {
1720         re_node_set eclosure_elem;
1721         int edest = dfa->edests[node].elems[i];
1722         /* If calculating the epsilon closure of `edest' is in progress,
1723            return intermediate result.  */
1724         if (dfa->eclosures[edest].nelem == -1)
1725           {
1726             incomplete = 1;
1727             continue;
1728           }
1729         /* If we haven't calculated the epsilon closure of `edest' yet,
1730            calculate now. Otherwise use calculated epsilon closure.  */
1731         if (dfa->eclosures[edest].nelem == 0)
1732           {
1733             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1734             if (BE (err != REG_NOERROR, 0))
1735               return err;
1736           }
1737         else
1738           eclosure_elem = dfa->eclosures[edest];
1739         /* Merge the epsilon closure of `edest'.  */
1740         err = re_node_set_merge (&eclosure, &eclosure_elem);
1741         if (BE (err != REG_NOERROR, 0))
1742           return err;
1743         /* If the epsilon closure of `edest' is incomplete,
1744            the epsilon closure of this node is also incomplete.  */
1745         if (dfa->eclosures[edest].nelem == 0)
1746           {
1747             incomplete = 1;
1748             re_node_set_free (&eclosure_elem);
1749           }
1750       }
1751
1752   /* An epsilon closure includes itself.  */
1753   ret = re_node_set_insert (&eclosure, node);
1754   if (BE (ret < 0, 0))
1755     return REG_ESPACE;
1756   if (incomplete && !root)
1757     dfa->eclosures[node].nelem = 0;
1758   else
1759     dfa->eclosures[node] = eclosure;
1760   *new_set = eclosure;
1761   return REG_NOERROR;
1762 }
1763 \f
1764 /* Functions for token which are used in the parser.  */
1765
1766 /* Fetch a token from INPUT.
1767    We must not use this function inside bracket expressions.  */
1768
1769 static void
1770 internal_function
1771 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1772 {
1773   re_string_skip_bytes (input, peek_token (result, input, syntax));
1774 }
1775
1776 /* Peek a token from INPUT, and return the length of the token.
1777    We must not use this function inside bracket expressions.  */
1778
1779 static int
1780 internal_function
1781 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1782 {
1783   unsigned char c;
1784
1785   if (re_string_eoi (input))
1786     {
1787       token->type = END_OF_RE;
1788       return 0;
1789     }
1790
1791   c = re_string_peek_byte (input, 0);
1792   token->opr.c = c;
1793
1794   token->word_char = 0;
1795 #ifdef RE_ENABLE_I18N
1796   token->mb_partial = 0;
1797   if (input->mb_cur_max > 1 &&
1798       !re_string_first_byte (input, re_string_cur_idx (input)))
1799     {
1800       token->type = CHARACTER;
1801       token->mb_partial = 1;
1802       return 1;
1803     }
1804 #endif
1805   if (c == '\\')
1806     {
1807       unsigned char c2;
1808       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1809         {
1810           token->type = BACK_SLASH;
1811           return 1;
1812         }
1813
1814       c2 = re_string_peek_byte_case (input, 1);
1815       token->opr.c = c2;
1816       token->type = CHARACTER;
1817 #ifdef RE_ENABLE_I18N
1818       if (input->mb_cur_max > 1)
1819         {
1820           wint_t wc = re_string_wchar_at (input,
1821                                           re_string_cur_idx (input) + 1);
1822           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1823         }
1824       else
1825 #endif
1826         token->word_char = IS_WORD_CHAR (c2) != 0;
1827
1828       switch (c2)
1829         {
1830         case '|':
1831           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1832             token->type = OP_ALT;
1833           break;
1834         case '1': case '2': case '3': case '4': case '5':
1835         case '6': case '7': case '8': case '9':
1836           if (!(syntax & RE_NO_BK_REFS))
1837             {
1838               token->type = OP_BACK_REF;
1839               token->opr.idx = c2 - '1';
1840             }
1841           break;
1842         case '<':
1843           if (!(syntax & RE_NO_GNU_OPS))
1844             {
1845               token->type = ANCHOR;
1846               token->opr.ctx_type = WORD_FIRST;
1847             }
1848           break;
1849         case '>':
1850           if (!(syntax & RE_NO_GNU_OPS))
1851             {
1852               token->type = ANCHOR;
1853               token->opr.ctx_type = WORD_LAST;
1854             }
1855           break;
1856         case 'b':
1857           if (!(syntax & RE_NO_GNU_OPS))
1858             {
1859               token->type = ANCHOR;
1860               token->opr.ctx_type = WORD_DELIM;
1861             }
1862           break;
1863         case 'B':
1864           if (!(syntax & RE_NO_GNU_OPS))
1865             {
1866               token->type = ANCHOR;
1867               token->opr.ctx_type = NOT_WORD_DELIM;
1868             }
1869           break;
1870         case 'w':
1871           if (!(syntax & RE_NO_GNU_OPS))
1872             token->type = OP_WORD;
1873           break;
1874         case 'W':
1875           if (!(syntax & RE_NO_GNU_OPS))
1876             token->type = OP_NOTWORD;
1877           break;
1878         case 's':
1879           if (!(syntax & RE_NO_GNU_OPS))
1880             token->type = OP_SPACE;
1881           break;
1882         case 'S':
1883           if (!(syntax & RE_NO_GNU_OPS))
1884             token->type = OP_NOTSPACE;
1885           break;
1886         case '`':
1887           if (!(syntax & RE_NO_GNU_OPS))
1888             {
1889               token->type = ANCHOR;
1890               token->opr.ctx_type = BUF_FIRST;
1891             }
1892           break;
1893         case '\'':
1894           if (!(syntax & RE_NO_GNU_OPS))
1895             {
1896               token->type = ANCHOR;
1897               token->opr.ctx_type = BUF_LAST;
1898             }
1899           break;
1900         case '(':
1901           if (!(syntax & RE_NO_BK_PARENS))
1902             token->type = OP_OPEN_SUBEXP;
1903           break;
1904         case ')':
1905           if (!(syntax & RE_NO_BK_PARENS))
1906             token->type = OP_CLOSE_SUBEXP;
1907           break;
1908         case '+':
1909           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1910             token->type = OP_DUP_PLUS;
1911           break;
1912         case '?':
1913           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1914             token->type = OP_DUP_QUESTION;
1915           break;
1916         case '{':
1917           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1918             token->type = OP_OPEN_DUP_NUM;
1919           break;
1920         case '}':
1921           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1922             token->type = OP_CLOSE_DUP_NUM;
1923           break;
1924         default:
1925           break;
1926         }
1927       return 2;
1928     }
1929
1930   token->type = CHARACTER;
1931 #ifdef RE_ENABLE_I18N
1932   if (input->mb_cur_max > 1)
1933     {
1934       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1935       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1936     }
1937   else
1938 #endif
1939     token->word_char = IS_WORD_CHAR (token->opr.c);
1940
1941   switch (c)
1942     {
1943     case '\n':
1944       if (syntax & RE_NEWLINE_ALT)
1945         token->type = OP_ALT;
1946       break;
1947     case '|':
1948       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1949         token->type = OP_ALT;
1950       break;
1951     case '*':
1952       token->type = OP_DUP_ASTERISK;
1953       break;
1954     case '+':
1955       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1956         token->type = OP_DUP_PLUS;
1957       break;
1958     case '?':
1959       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1960         token->type = OP_DUP_QUESTION;
1961       break;
1962     case '{':
1963       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1964         token->type = OP_OPEN_DUP_NUM;
1965       break;
1966     case '}':
1967       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1968         token->type = OP_CLOSE_DUP_NUM;
1969       break;
1970     case '(':
1971       if (syntax & RE_NO_BK_PARENS)
1972         token->type = OP_OPEN_SUBEXP;
1973       break;
1974     case ')':
1975       if (syntax & RE_NO_BK_PARENS)
1976         token->type = OP_CLOSE_SUBEXP;
1977       break;
1978     case '[':
1979       token->type = OP_OPEN_BRACKET;
1980       break;
1981     case '.':
1982       token->type = OP_PERIOD;
1983       break;
1984     case '^':
1985       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1986           re_string_cur_idx (input) != 0)
1987         {
1988           char prev = re_string_peek_byte (input, -1);
1989           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1990             break;
1991         }
1992       token->type = ANCHOR;
1993       token->opr.ctx_type = LINE_FIRST;
1994       break;
1995     case '$':
1996       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1997           re_string_cur_idx (input) + 1 != re_string_length (input))
1998         {
1999           re_token_t next;
2000           re_string_skip_bytes (input, 1);
2001           peek_token (&next, input, syntax);
2002           re_string_skip_bytes (input, -1);
2003           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2004             break;
2005         }
2006       token->type = ANCHOR;
2007       token->opr.ctx_type = LINE_LAST;
2008       break;
2009     default:
2010       break;
2011     }
2012   return 1;
2013 }
2014
2015 /* Peek a token from INPUT, and return the length of the token.
2016    We must not use this function out of bracket expressions.  */
2017
2018 static int
2019 internal_function
2020 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2021 {
2022   unsigned char c;
2023   if (re_string_eoi (input))
2024     {
2025       token->type = END_OF_RE;
2026       return 0;
2027     }
2028   c = re_string_peek_byte (input, 0);
2029   token->opr.c = c;
2030
2031 #ifdef RE_ENABLE_I18N
2032   if (input->mb_cur_max > 1 &&
2033       !re_string_first_byte (input, re_string_cur_idx (input)))
2034     {
2035       token->type = CHARACTER;
2036       return 1;
2037     }
2038 #endif /* RE_ENABLE_I18N */
2039
2040   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2041       && re_string_cur_idx (input) + 1 < re_string_length (input))
2042     {
2043       /* In this case, '\' escape a character.  */
2044       unsigned char c2;
2045       re_string_skip_bytes (input, 1);
2046       c2 = re_string_peek_byte (input, 0);
2047       token->opr.c = c2;
2048       token->type = CHARACTER;
2049       return 1;
2050     }
2051   if (c == '[') /* '[' is a special char in a bracket exps.  */
2052     {
2053       unsigned char c2;
2054       int token_len;
2055       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2056         c2 = re_string_peek_byte (input, 1);
2057       else
2058         c2 = 0;
2059       token->opr.c = c2;
2060       token_len = 2;
2061       switch (c2)
2062         {
2063         case '.':
2064           token->type = OP_OPEN_COLL_ELEM;
2065           break;
2066         case '=':
2067           token->type = OP_OPEN_EQUIV_CLASS;
2068           break;
2069         case ':':
2070           if (syntax & RE_CHAR_CLASSES)
2071             {
2072               token->type = OP_OPEN_CHAR_CLASS;
2073               break;
2074             }
2075           /* else fall through.  */
2076         default:
2077           token->type = CHARACTER;
2078           token->opr.c = c;
2079           token_len = 1;
2080           break;
2081         }
2082       return token_len;
2083     }
2084   switch (c)
2085     {
2086     case '-':
2087       token->type = OP_CHARSET_RANGE;
2088       break;
2089     case ']':
2090       token->type = OP_CLOSE_BRACKET;
2091       break;
2092     case '^':
2093       token->type = OP_NON_MATCH_LIST;
2094       break;
2095     default:
2096       token->type = CHARACTER;
2097     }
2098   return 1;
2099 }
2100 \f
2101 /* Functions for parser.  */
2102
2103 /* Entry point of the parser.
2104    Parse the regular expression REGEXP and return the structure tree.
2105    If an error has occurred, ERR is set by error code, and return NULL.
2106    This function build the following tree, from regular expression <reg_exp>:
2107            CAT
2108            / \
2109           /   \
2110    <reg_exp>  EOR
2111
2112    CAT means concatenation.
2113    EOR means end of regular expression.  */
2114
2115 static bin_tree_t *
2116 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2117        reg_errcode_t *err)
2118 {
2119   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2120   bin_tree_t *tree, *eor, *root;
2121   re_token_t current_token;
2122   dfa->syntax = syntax;
2123   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2124   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2125   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2126     return NULL;
2127   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2128   if (tree != NULL)
2129     root = create_tree (dfa, tree, eor, CONCAT);
2130   else
2131     root = eor;
2132   if (BE (eor == NULL || root == NULL, 0))
2133     {
2134       *err = REG_ESPACE;
2135       return NULL;
2136     }
2137   return root;
2138 }
2139
2140 /* This function build the following tree, from regular expression
2141    <branch1>|<branch2>:
2142            ALT
2143            / \
2144           /   \
2145    <branch1> <branch2>
2146
2147    ALT means alternative, which represents the operator `|'.  */
2148
2149 static bin_tree_t *
2150 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2151                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2152 {
2153   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2154   bin_tree_t *tree, *branch = NULL;
2155   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2156   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2157     return NULL;
2158
2159   while (token->type == OP_ALT)
2160     {
2161       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2162       if (token->type != OP_ALT && token->type != END_OF_RE
2163           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2164         {
2165           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2166           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2167             return NULL;
2168         }
2169       else
2170         branch = NULL;
2171       tree = create_tree (dfa, tree, branch, OP_ALT);
2172       if (BE (tree == NULL, 0))
2173         {
2174           *err = REG_ESPACE;
2175           return NULL;
2176         }
2177     }
2178   return tree;
2179 }
2180
2181 /* This function build the following tree, from regular expression
2182    <exp1><exp2>:
2183         CAT
2184         / \
2185        /   \
2186    <exp1> <exp2>
2187
2188    CAT means concatenation.  */
2189
2190 static bin_tree_t *
2191 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2192               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2193 {
2194   bin_tree_t *tree, *exp;
2195   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2196   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2197   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2198     return NULL;
2199
2200   while (token->type != OP_ALT && token->type != END_OF_RE
2201          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2202     {
2203       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2204       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2205         {
2206           return NULL;
2207         }
2208       if (tree != NULL && exp != NULL)
2209         {
2210           tree = create_tree (dfa, tree, exp, CONCAT);
2211           if (tree == NULL)
2212             {
2213               *err = REG_ESPACE;
2214               return NULL;
2215             }
2216         }
2217       else if (tree == NULL)
2218         tree = exp;
2219       /* Otherwise exp == NULL, we don't need to create new tree.  */
2220     }
2221   return tree;
2222 }
2223
2224 /* This function build the following tree, from regular expression a*:
2225          *
2226          |
2227          a
2228 */
2229
2230 static bin_tree_t *
2231 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2232                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2233 {
2234   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2235   bin_tree_t *tree;
2236   switch (token->type)
2237     {
2238     case CHARACTER:
2239       tree = create_token_tree (dfa, NULL, NULL, token);
2240       if (BE (tree == NULL, 0))
2241         {
2242           *err = REG_ESPACE;
2243           return NULL;
2244         }
2245 #ifdef RE_ENABLE_I18N
2246       if (dfa->mb_cur_max > 1)
2247         {
2248           while (!re_string_eoi (regexp)
2249                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2250             {
2251               bin_tree_t *mbc_remain;
2252               fetch_token (token, regexp, syntax);
2253               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2254               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2255               if (BE (mbc_remain == NULL || tree == NULL, 0))
2256                 {
2257                   *err = REG_ESPACE;
2258                   return NULL;
2259                 }
2260             }
2261         }
2262 #endif
2263       break;
2264     case OP_OPEN_SUBEXP:
2265       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2266       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2267         return NULL;
2268       break;
2269     case OP_OPEN_BRACKET:
2270       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2271       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2272         return NULL;
2273       break;
2274     case OP_BACK_REF:
2275       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2276         {
2277           *err = REG_ESUBREG;
2278           return NULL;
2279         }
2280       dfa->used_bkref_map |= 1 << token->opr.idx;
2281       tree = create_token_tree (dfa, NULL, NULL, token);
2282       if (BE (tree == NULL, 0))
2283         {
2284           *err = REG_ESPACE;
2285           return NULL;
2286         }
2287       ++dfa->nbackref;
2288       dfa->has_mb_node = 1;
2289       break;
2290     case OP_OPEN_DUP_NUM:
2291       if (syntax & RE_CONTEXT_INVALID_DUP)
2292         {
2293           *err = REG_BADRPT;
2294           return NULL;
2295         }
2296       /* FALLTHROUGH */
2297     case OP_DUP_ASTERISK:
2298     case OP_DUP_PLUS:
2299     case OP_DUP_QUESTION:
2300       if (syntax & RE_CONTEXT_INVALID_OPS)
2301         {
2302           *err = REG_BADRPT;
2303           return NULL;
2304         }
2305       else if (syntax & RE_CONTEXT_INDEP_OPS)
2306         {
2307           fetch_token (token, regexp, syntax);
2308           return parse_expression (regexp, preg, token, syntax, nest, err);
2309         }
2310       /* else fall through  */
2311     case OP_CLOSE_SUBEXP:
2312       if ((token->type == OP_CLOSE_SUBEXP) &&
2313           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2314         {
2315           *err = REG_ERPAREN;
2316           return NULL;
2317         }
2318       /* else fall through  */
2319     case OP_CLOSE_DUP_NUM:
2320       /* We treat it as a normal character.  */
2321
2322       /* Then we can these characters as normal characters.  */
2323       token->type = CHARACTER;
2324       /* mb_partial and word_char bits should be initialized already
2325          by peek_token.  */
2326       tree = create_token_tree (dfa, NULL, NULL, token);
2327       if (BE (tree == NULL, 0))
2328         {
2329           *err = REG_ESPACE;
2330           return NULL;
2331         }
2332       break;
2333     case ANCHOR:
2334       if ((token->opr.ctx_type
2335            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2336           && dfa->word_ops_used == 0)
2337         init_word_char (dfa);
2338       if (token->opr.ctx_type == WORD_DELIM
2339           || token->opr.ctx_type == NOT_WORD_DELIM)
2340         {
2341           bin_tree_t *tree_first, *tree_last;
2342           if (token->opr.ctx_type == WORD_DELIM)
2343             {
2344               token->opr.ctx_type = WORD_FIRST;
2345               tree_first = create_token_tree (dfa, NULL, NULL, token);
2346               token->opr.ctx_type = WORD_LAST;
2347             }
2348           else
2349             {
2350               token->opr.ctx_type = INSIDE_WORD;
2351               tree_first = create_token_tree (dfa, NULL, NULL, token);
2352               token->opr.ctx_type = INSIDE_NOTWORD;
2353             }
2354           tree_last = create_token_tree (dfa, NULL, NULL, token);
2355           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2356           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2357             {
2358               *err = REG_ESPACE;
2359               return NULL;
2360             }
2361         }
2362       else
2363         {
2364           tree = create_token_tree (dfa, NULL, NULL, token);
2365           if (BE (tree == NULL, 0))
2366             {
2367               *err = REG_ESPACE;
2368               return NULL;
2369             }
2370         }
2371       /* We must return here, since ANCHORs can't be followed
2372          by repetition operators.
2373          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2374              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2375       fetch_token (token, regexp, syntax);
2376       return tree;
2377     case OP_PERIOD:
2378       tree = create_token_tree (dfa, NULL, NULL, token);
2379       if (BE (tree == NULL, 0))
2380         {
2381           *err = REG_ESPACE;
2382           return NULL;
2383         }
2384       if (dfa->mb_cur_max > 1)
2385         dfa->has_mb_node = 1;
2386       break;
2387     case OP_WORD:
2388     case OP_NOTWORD:
2389       tree = build_charclass_op (dfa, regexp->trans,
2390                                  "alnum",
2391                                  "_",
2392                                  token->type == OP_NOTWORD, err);
2393       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2394         return NULL;
2395       break;
2396     case OP_SPACE:
2397     case OP_NOTSPACE:
2398       tree = build_charclass_op (dfa, regexp->trans,
2399                                  "space",
2400                                  "",
2401                                  token->type == OP_NOTSPACE, err);
2402       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2403         return NULL;
2404       break;
2405     case OP_ALT:
2406     case END_OF_RE:
2407       return NULL;
2408     case BACK_SLASH:
2409       *err = REG_EESCAPE;
2410       return NULL;
2411     default:
2412       /* Must not happen?  */
2413 #ifdef DEBUG
2414       assert (0);
2415 #endif
2416       return NULL;
2417     }
2418   fetch_token (token, regexp, syntax);
2419
2420   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2421          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2422     {
2423       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2424       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2425         return NULL;
2426       /* In BRE consecutive duplications are not allowed.  */
2427       if ((syntax & RE_CONTEXT_INVALID_DUP)
2428           && (token->type == OP_DUP_ASTERISK
2429               || token->type == OP_OPEN_DUP_NUM))
2430         {
2431           *err = REG_BADRPT;
2432           return NULL;
2433         }
2434     }
2435
2436   return tree;
2437 }
2438
2439 /* This function build the following tree, from regular expression
2440    (<reg_exp>):
2441          SUBEXP
2442             |
2443         <reg_exp>
2444 */
2445
2446 static bin_tree_t *
2447 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2448                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2449 {
2450   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2451   bin_tree_t *tree;
2452   size_t cur_nsub;
2453   cur_nsub = preg->re_nsub++;
2454
2455   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2456
2457   /* The subexpression may be a null string.  */
2458   if (token->type == OP_CLOSE_SUBEXP)
2459     tree = NULL;
2460   else
2461     {
2462       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2463       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2464         *err = REG_EPAREN;
2465       if (BE (*err != REG_NOERROR, 0))
2466         return NULL;
2467     }
2468
2469   if (cur_nsub <= '9' - '1')
2470     dfa->completed_bkref_map |= 1 << cur_nsub;
2471
2472   tree = create_tree (dfa, tree, NULL, SUBEXP);
2473   if (BE (tree == NULL, 0))
2474     {
2475       *err = REG_ESPACE;
2476       return NULL;
2477     }
2478   tree->token.opr.idx = cur_nsub;
2479   return tree;
2480 }
2481
2482 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2483
2484 static bin_tree_t *
2485 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2486               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2487 {
2488   bin_tree_t *tree = NULL, *old_tree = NULL;
2489   int i, start, end, start_idx = re_string_cur_idx (regexp);
2490 #ifndef RE_TOKEN_INIT_BUG
2491   re_token_t start_token = *token;
2492 #else
2493   re_token_t start_token;
2494
2495   memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2496 #endif
2497
2498   if (token->type == OP_OPEN_DUP_NUM)
2499     {
2500       end = 0;
2501       start = fetch_number (regexp, token, syntax);
2502       if (start == -1)
2503         {
2504           if (token->type == CHARACTER && token->opr.c == ',')
2505             start = 0; /* We treat "{,m}" as "{0,m}".  */
2506           else
2507             {
2508               *err = REG_BADBR; /* <re>{} is invalid.  */
2509               return NULL;
2510             }
2511         }
2512       if (BE (start != -2, 1))
2513         {
2514           /* We treat "{n}" as "{n,n}".  */
2515           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2516                  : ((token->type == CHARACTER && token->opr.c == ',')
2517                     ? fetch_number (regexp, token, syntax) : -2));
2518         }
2519       if (BE (start == -2 || end == -2, 0))
2520         {
2521           /* Invalid sequence.  */
2522           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2523             {
2524               if (token->type == END_OF_RE)
2525                 *err = REG_EBRACE;
2526               else
2527                 *err = REG_BADBR;
2528
2529               return NULL;
2530             }
2531
2532           /* If the syntax bit is set, rollback.  */
2533           re_string_set_index (regexp, start_idx);
2534           *token = start_token;
2535           token->type = CHARACTER;
2536           /* mb_partial and word_char bits should be already initialized by
2537              peek_token.  */
2538           return elem;
2539         }
2540
2541       if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2542         {
2543           /* First number greater than second.  */
2544           *err = REG_BADBR;
2545           return NULL;
2546         }
2547     }
2548   else
2549     {
2550       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2551       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2552     }
2553
2554   fetch_token (token, regexp, syntax);
2555
2556   if (BE (elem == NULL, 0))
2557     return NULL;
2558   if (BE (start == 0 && end == 0, 0))
2559     {
2560       postorder (elem, free_tree, NULL);
2561       return NULL;
2562     }
2563
2564   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2565   if (BE (start > 0, 0))
2566     {
2567       tree = elem;
2568       for (i = 2; i <= start; ++i)
2569         {
2570           elem = duplicate_tree (elem, dfa);
2571           tree = create_tree (dfa, tree, elem, CONCAT);
2572           if (BE (elem == NULL || tree == NULL, 0))
2573             goto parse_dup_op_espace;
2574         }
2575
2576       if (start == end)
2577         return tree;
2578
2579       /* Duplicate ELEM before it is marked optional.  */
2580       elem = duplicate_tree (elem, dfa);
2581       old_tree = tree;
2582     }
2583   else
2584     old_tree = NULL;
2585
2586   if (elem->token.type == SUBEXP)
2587     postorder (elem, mark_opt_subexp, (void *) (intptr_t) elem->token.opr.idx);
2588
2589   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2590   if (BE (tree == NULL, 0))
2591     goto parse_dup_op_espace;
2592
2593   /* This loop is actually executed only when end != -1,
2594      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2595      already created the start+1-th copy.  */
2596   for (i = start + 2; i <= end; ++i)
2597     {
2598       elem = duplicate_tree (elem, dfa);
2599       tree = create_tree (dfa, tree, elem, CONCAT);
2600       if (BE (elem == NULL || tree == NULL, 0))
2601         goto parse_dup_op_espace;
2602
2603       tree = create_tree (dfa, tree, NULL, OP_ALT);
2604       if (BE (tree == NULL, 0))
2605         goto parse_dup_op_espace;
2606     }
2607
2608   if (old_tree)
2609     tree = create_tree (dfa, old_tree, tree, CONCAT);
2610
2611   return tree;
2612
2613  parse_dup_op_espace:
2614   *err = REG_ESPACE;
2615   return NULL;
2616 }
2617
2618 /* Size of the names for collating symbol/equivalence_class/character_class.
2619    I'm not sure, but maybe enough.  */
2620 #define BRACKET_NAME_BUF_SIZE 32
2621
2622 #ifndef _LIBC
2623   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2624      Build the range expression which starts from START_ELEM, and ends
2625      at END_ELEM.  The result are written to MBCSET and SBCSET.
2626      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2627      mbcset->range_ends, is a pointer argument since we may
2628      update it.  */
2629
2630 static reg_errcode_t
2631 internal_function
2632 # ifdef RE_ENABLE_I18N
2633 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2634                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2635 # else /* not RE_ENABLE_I18N */
2636 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2637                  bracket_elem_t *end_elem)
2638 # endif /* not RE_ENABLE_I18N */
2639 {
2640   unsigned int start_ch, end_ch;
2641   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2642   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2643           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2644           0))
2645     return REG_ERANGE;
2646
2647   /* We can handle no multi character collating elements without libc
2648      support.  */
2649   if (BE ((start_elem->type == COLL_SYM
2650            && strlen ((char *) start_elem->opr.name) > 1)
2651           || (end_elem->type == COLL_SYM
2652               && strlen ((char *) end_elem->opr.name) > 1), 0))
2653     return REG_ECOLLATE;
2654
2655 # ifdef RE_ENABLE_I18N
2656   {
2657     wchar_t wc;
2658     wint_t start_wc;
2659     wint_t end_wc;
2660     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2661
2662     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2663                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2664                    : 0));
2665     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2666               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2667                  : 0));
2668 #ifdef GAWK
2669     /*
2670      * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2671      * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2672      * unsigned, so we don't have sign extension problems.
2673      */
2674     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2675                 ? start_ch : start_elem->opr.wch);
2676     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2677               ? end_ch : end_elem->opr.wch);
2678 #else
2679     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2680                 ? __btowc (start_ch) : start_elem->opr.wch);
2681     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2682               ? __btowc (end_ch) : end_elem->opr.wch);
2683 #endif
2684     if (start_wc == WEOF || end_wc == WEOF)
2685       return REG_ECOLLATE;
2686     cmp_buf[0] = start_wc;
2687     cmp_buf[4] = end_wc;
2688     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2689       return REG_ERANGE;
2690
2691     /* Got valid collation sequence values, add them as a new entry.
2692        However, for !_LIBC we have no collation elements: if the
2693        character set is single byte, the single byte character set
2694        that we build below suffices.  parse_bracket_exp passes
2695        no MBCSET if dfa->mb_cur_max == 1.  */
2696     if (mbcset)
2697       {
2698         /* Check the space of the arrays.  */
2699         if (BE (*range_alloc == mbcset->nranges, 0))
2700           {
2701             /* There is not enough space, need realloc.  */
2702             wchar_t *new_array_start, *new_array_end;
2703             int new_nranges;
2704
2705             /* +1 in case of mbcset->nranges is 0.  */
2706             new_nranges = 2 * mbcset->nranges + 1;
2707             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2708                are NULL if *range_alloc == 0.  */
2709             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2710                                           new_nranges);
2711             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2712                                         new_nranges);
2713
2714             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2715               return REG_ESPACE;
2716
2717             mbcset->range_starts = new_array_start;
2718             mbcset->range_ends = new_array_end;
2719             *range_alloc = new_nranges;
2720           }
2721
2722         mbcset->range_starts[mbcset->nranges] = start_wc;
2723         mbcset->range_ends[mbcset->nranges++] = end_wc;
2724       }
2725
2726     /* Build the table for single byte characters.  */
2727     for (wc = 0; wc < SBC_MAX; ++wc)
2728       {
2729         cmp_buf[2] = wc;
2730         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2731             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2732           bitset_set (sbcset, wc);
2733       }
2734   }
2735 # else /* not RE_ENABLE_I18N */
2736   {
2737     unsigned int ch;
2738     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2739                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2740                    : 0));
2741     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2742               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2743                  : 0));
2744     if (start_ch > end_ch)
2745       return REG_ERANGE;
2746     /* Build the table for single byte characters.  */
2747     for (ch = 0; ch < SBC_MAX; ++ch)
2748       if (start_ch <= ch  && ch <= end_ch)
2749         bitset_set (sbcset, ch);
2750   }
2751 # endif /* not RE_ENABLE_I18N */
2752   return REG_NOERROR;
2753 }
2754 #endif /* not _LIBC */
2755
2756 #ifndef _LIBC
2757 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2758    Build the collating element which is represented by NAME.
2759    The result are written to MBCSET and SBCSET.
2760    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2761    pointer argument since we may update it.  */
2762
2763 static reg_errcode_t
2764 internal_function
2765 # ifdef RE_ENABLE_I18N
2766 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2767                         int *coll_sym_alloc, const unsigned char *name)
2768 # else /* not RE_ENABLE_I18N */
2769 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2770 # endif /* not RE_ENABLE_I18N */
2771 {
2772   size_t name_len = strlen ((const char *) name);
2773   if (BE (name_len != 1, 0))
2774     return REG_ECOLLATE;
2775   else
2776     {
2777       bitset_set (sbcset, name[0]);
2778       return REG_NOERROR;
2779     }
2780 }
2781 #endif /* not _LIBC */
2782
2783 /* This function parse bracket expression like "[abc]", "[a-c]",
2784    "[[.a-a.]]" etc.  */
2785
2786 static bin_tree_t *
2787 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2788                    reg_syntax_t syntax, reg_errcode_t *err)
2789 {
2790 #ifdef _LIBC
2791   const unsigned char *collseqmb;
2792   const char *collseqwc;
2793   uint32_t nrules;
2794   int32_t table_size;
2795   const int32_t *symb_table;
2796   const unsigned char *extra;
2797
2798   /* Local function for parse_bracket_exp used in _LIBC environment.
2799      Seek the collating symbol entry correspondings to NAME.
2800      Return the index of the symbol in the SYMB_TABLE.  */
2801
2802   auto inline int32_t
2803   __attribute ((always_inline))
2804   seek_collating_symbol_entry (name, name_len)
2805          const unsigned char *name;
2806          size_t name_len;
2807     {
2808       int32_t hash = elem_hash ((const char *) name, name_len);
2809       int32_t elem = hash % table_size;
2810       if (symb_table[2 * elem] != 0)
2811         {
2812           int32_t second = hash % (table_size - 2) + 1;
2813
2814           do
2815             {
2816               /* First compare the hashing value.  */
2817               if (symb_table[2 * elem] == hash
2818                   /* Compare the length of the name.  */
2819                   && name_len == extra[symb_table[2 * elem + 1]]
2820                   /* Compare the name.  */
2821                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2822                              name_len) == 0)
2823                 {
2824                   /* Yep, this is the entry.  */
2825                   break;
2826                 }
2827
2828               /* Next entry.  */
2829               elem += second;
2830             }
2831           while (symb_table[2 * elem] != 0);
2832         }
2833       return elem;
2834     }
2835
2836   /* Local function for parse_bracket_exp used in _LIBC environment.
2837      Look up the collation sequence value of BR_ELEM.
2838      Return the value if succeeded, UINT_MAX otherwise.  */
2839
2840   auto inline unsigned int
2841   __attribute ((always_inline))
2842   lookup_collation_sequence_value (br_elem)
2843          bracket_elem_t *br_elem;
2844     {
2845       if (br_elem->type == SB_CHAR)
2846         {
2847           /*
2848           if (MB_CUR_MAX == 1)
2849           */
2850           if (nrules == 0)
2851             return collseqmb[br_elem->opr.ch];
2852           else
2853             {
2854               wint_t wc = __btowc (br_elem->opr.ch);
2855               return __collseq_table_lookup (collseqwc, wc);
2856             }
2857         }
2858       else if (br_elem->type == MB_CHAR)
2859         {
2860           if (nrules != 0)
2861             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2862         }
2863       else if (br_elem->type == COLL_SYM)
2864         {
2865           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2866           if (nrules != 0)
2867             {
2868               int32_t elem, idx;
2869               elem = seek_collating_symbol_entry (br_elem->opr.name,
2870                                                   sym_name_len);
2871               if (symb_table[2 * elem] != 0)
2872                 {
2873                   /* We found the entry.  */
2874                   idx = symb_table[2 * elem + 1];
2875                   /* Skip the name of collating element name.  */
2876                   idx += 1 + extra[idx];
2877                   /* Skip the byte sequence of the collating element.  */
2878                   idx += 1 + extra[idx];
2879                   /* Adjust for the alignment.  */
2880                   idx = (idx + 3) & ~3;
2881                   /* Skip the multibyte collation sequence value.  */
2882                   idx += sizeof (unsigned int);
2883                   /* Skip the wide char sequence of the collating element.  */
2884                   idx += sizeof (unsigned int) *
2885                     (1 + *(unsigned int *) (extra + idx));
2886                   /* Return the collation sequence value.  */
2887                   return *(unsigned int *) (extra + idx);
2888                 }
2889               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2890                 {
2891                   /* No valid character.  Match it as a single byte
2892                      character.  */
2893                   return collseqmb[br_elem->opr.name[0]];
2894                 }
2895             }
2896           else if (sym_name_len == 1)
2897             return collseqmb[br_elem->opr.name[0]];
2898         }
2899       return UINT_MAX;
2900     }
2901
2902   /* Local function for parse_bracket_exp used in _LIBC environment.
2903      Build the range expression which starts from START_ELEM, and ends
2904      at END_ELEM.  The result are written to MBCSET and SBCSET.
2905      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2906      mbcset->range_ends, is a pointer argument since we may
2907      update it.  */
2908
2909   auto inline reg_errcode_t
2910   __attribute ((always_inline))
2911   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2912          re_charset_t *mbcset;
2913          int *range_alloc;
2914          bitset_t sbcset;
2915          bracket_elem_t *start_elem, *end_elem;
2916     {
2917       unsigned int ch;
2918       uint32_t start_collseq;
2919       uint32_t end_collseq;
2920
2921       /* Equivalence Classes and Character Classes can't be a range
2922          start/end.  */
2923       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2924               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2925               0))
2926         return REG_ERANGE;
2927
2928       start_collseq = lookup_collation_sequence_value (start_elem);
2929       end_collseq = lookup_collation_sequence_value (end_elem);
2930       /* Check start/end collation sequence values.  */
2931       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2932         return REG_ECOLLATE;
2933       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2934         return REG_ERANGE;
2935
2936       /* Got valid collation sequence values, add them as a new entry.
2937          However, if we have no collation elements, and the character set
2938          is single byte, the single byte character set that we
2939          build below suffices. */
2940       if (nrules > 0 || dfa->mb_cur_max > 1)
2941         {
2942           /* Check the space of the arrays.  */
2943           if (BE (*range_alloc == mbcset->nranges, 0))
2944             {
2945               /* There is not enough space, need realloc.  */
2946               uint32_t *new_array_start;
2947               uint32_t *new_array_end;
2948               int new_nranges;
2949
2950               /* +1 in case of mbcset->nranges is 0.  */
2951               new_nranges = 2 * mbcset->nranges + 1;
2952               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2953                                             new_nranges);
2954               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2955                                           new_nranges);
2956
2957               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2958                 return REG_ESPACE;
2959
2960               mbcset->range_starts = new_array_start;
2961               mbcset->range_ends = new_array_end;
2962               *range_alloc = new_nranges;
2963             }
2964
2965           mbcset->range_starts[mbcset->nranges] = start_collseq;
2966           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2967         }
2968
2969       /* Build the table for single byte characters.  */
2970       for (ch = 0; ch < SBC_MAX; ch++)
2971         {
2972           uint32_t ch_collseq;
2973           /*
2974           if (MB_CUR_MAX == 1)
2975           */
2976           if (nrules == 0)
2977             ch_collseq = collseqmb[ch];
2978           else
2979             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2980           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2981             bitset_set (sbcset, ch);
2982         }
2983       return REG_NOERROR;
2984     }
2985
2986   /* Local function for parse_bracket_exp used in _LIBC environment.
2987      Build the collating element which is represented by NAME.
2988      The result are written to MBCSET and SBCSET.
2989      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2990      pointer argument since we may update it.  */
2991
2992   auto inline reg_errcode_t
2993   __attribute ((always_inline))
2994   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2995          re_charset_t *mbcset;
2996          int *coll_sym_alloc;
2997          bitset_t sbcset;
2998          const unsigned char *name;
2999     {
3000       int32_t elem, idx;
3001       size_t name_len = strlen ((const char *) name);
3002       if (nrules != 0)
3003         {
3004           elem = seek_collating_symbol_entry (name, name_len);
3005           if (symb_table[2 * elem] != 0)
3006             {
3007               /* We found the entry.  */
3008               idx = symb_table[2 * elem + 1];
3009               /* Skip the name of collating element name.  */
3010               idx += 1 + extra[idx];
3011             }
3012           else if (symb_table[2 * elem] == 0 && name_len == 1)
3013             {
3014               /* No valid character, treat it as a normal
3015                  character.  */
3016               bitset_set (sbcset, name[0]);
3017               return REG_NOERROR;
3018             }
3019           else
3020             return REG_ECOLLATE;
3021
3022           /* Got valid collation sequence, add it as a new entry.  */
3023           /* Check the space of the arrays.  */
3024           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3025             {
3026               /* Not enough, realloc it.  */
3027               /* +1 in case of mbcset->ncoll_syms is 0.  */
3028               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3029               /* Use realloc since mbcset->coll_syms is NULL
3030                  if *alloc == 0.  */
3031               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3032                                                    new_coll_sym_alloc);
3033               if (BE (new_coll_syms == NULL, 0))
3034                 return REG_ESPACE;
3035               mbcset->coll_syms = new_coll_syms;
3036               *coll_sym_alloc = new_coll_sym_alloc;
3037             }
3038           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3039           return REG_NOERROR;
3040         }
3041       else
3042         {
3043           if (BE (name_len != 1, 0))
3044             return REG_ECOLLATE;
3045           else
3046             {
3047               bitset_set (sbcset, name[0]);
3048               return REG_NOERROR;
3049             }
3050         }
3051     }
3052 #endif
3053
3054   re_token_t br_token;
3055   re_bitset_ptr_t sbcset;
3056 #ifdef RE_ENABLE_I18N
3057   re_charset_t *mbcset;
3058   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3059   int equiv_class_alloc = 0, char_class_alloc = 0;
3060 #endif /* not RE_ENABLE_I18N */
3061   int non_match = 0;
3062   bin_tree_t *work_tree;
3063   int token_len;
3064   int first_round = 1;
3065 #ifdef _LIBC
3066   collseqmb = (const unsigned char *)
3067     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3068   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3069   if (nrules)
3070     {
3071       /*
3072       if (MB_CUR_MAX > 1)
3073       */
3074       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3075       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3076       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3077                                                   _NL_COLLATE_SYMB_TABLEMB);
3078       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3079                                                    _NL_COLLATE_SYMB_EXTRAMB);
3080     }
3081 #endif
3082   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3083 #ifdef RE_ENABLE_I18N
3084   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3085 #endif /* RE_ENABLE_I18N */
3086 #ifdef RE_ENABLE_I18N
3087   if (BE (sbcset == NULL || mbcset == NULL, 0))
3088 #else
3089   if (BE (sbcset == NULL, 0))
3090 #endif /* RE_ENABLE_I18N */
3091     {
3092       *err = REG_ESPACE;
3093       return NULL;
3094     }
3095
3096   token_len = peek_token_bracket (token, regexp, syntax);
3097   if (BE (token->type == END_OF_RE, 0))
3098     {
3099       *err = REG_BADPAT;
3100       goto parse_bracket_exp_free_return;
3101     }
3102   if (token->type == OP_NON_MATCH_LIST)
3103     {
3104 #ifdef RE_ENABLE_I18N
3105       mbcset->non_match = 1;
3106 #endif /* not RE_ENABLE_I18N */
3107       non_match = 1;
3108       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3109         bitset_set (sbcset, '\n');
3110       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3111       token_len = peek_token_bracket (token, regexp, syntax);
3112       if (BE (token->type == END_OF_RE, 0))
3113         {
3114           *err = REG_BADPAT;
3115           goto parse_bracket_exp_free_return;
3116         }
3117     }
3118
3119   /* We treat the first ']' as a normal character.  */
3120   if (token->type == OP_CLOSE_BRACKET)
3121     token->type = CHARACTER;
3122
3123   while (1)
3124     {
3125       bracket_elem_t start_elem, end_elem;
3126       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3127       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3128       reg_errcode_t ret;
3129       int token_len2 = 0, is_range_exp = 0;
3130       re_token_t token2;
3131
3132       start_elem.opr.name = start_name_buf;
3133       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3134                                    syntax, first_round);
3135       if (BE (ret != REG_NOERROR, 0))
3136         {
3137           *err = ret;
3138           goto parse_bracket_exp_free_return;
3139         }
3140       first_round = 0;
3141
3142       /* Get information about the next token.  We need it in any case.  */
3143       token_len = peek_token_bracket (token, regexp, syntax);
3144
3145       /* Do not check for ranges if we know they are not allowed.  */
3146       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3147         {
3148           if (BE (token->type == END_OF_RE, 0))
3149             {
3150               *err = REG_EBRACK;
3151               goto parse_bracket_exp_free_return;
3152             }
3153           if (token->type == OP_CHARSET_RANGE)
3154             {
3155               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3156               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3157               if (BE (token2.type == END_OF_RE, 0))
3158                 {
3159                   *err = REG_EBRACK;
3160                   goto parse_bracket_exp_free_return;
3161                 }
3162               if (token2.type == OP_CLOSE_BRACKET)
3163                 {
3164                   /* We treat the last '-' as a normal character.  */
3165                   re_string_skip_bytes (regexp, -token_len);
3166                   token->type = CHARACTER;
3167                 }
3168               else
3169                 is_range_exp = 1;
3170             }
3171         }
3172
3173       if (is_range_exp == 1)
3174         {
3175           end_elem.opr.name = end_name_buf;
3176           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3177                                        dfa, syntax, 1);
3178           if (BE (ret != REG_NOERROR, 0))
3179             {
3180               *err = ret;
3181               goto parse_bracket_exp_free_return;
3182             }
3183
3184           token_len = peek_token_bracket (token, regexp, syntax);
3185
3186 #ifdef _LIBC
3187           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3188                                   &start_elem, &end_elem);
3189 #else
3190 # ifdef RE_ENABLE_I18N
3191           *err = build_range_exp (sbcset,
3192                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3193                                   &range_alloc, &start_elem, &end_elem);
3194 # else
3195           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3196 # endif
3197 #endif /* RE_ENABLE_I18N */
3198           if (BE (*err != REG_NOERROR, 0))
3199             goto parse_bracket_exp_free_return;
3200         }
3201       else
3202         {
3203           switch (start_elem.type)
3204             {
3205             case SB_CHAR:
3206               bitset_set (sbcset, start_elem.opr.ch);
3207               break;
3208 #ifdef RE_ENABLE_I18N
3209             case MB_CHAR:
3210               /* Check whether the array has enough space.  */
3211               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3212                 {
3213                   wchar_t *new_mbchars;
3214                   /* Not enough, realloc it.  */
3215                   /* +1 in case of mbcset->nmbchars is 0.  */
3216                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3217                   /* Use realloc since array is NULL if *alloc == 0.  */
3218                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3219                                             mbchar_alloc);
3220                   if (BE (new_mbchars == NULL, 0))
3221                     goto parse_bracket_exp_espace;
3222                   mbcset->mbchars = new_mbchars;
3223                 }
3224               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3225               break;
3226 #endif /* RE_ENABLE_I18N */
3227             case EQUIV_CLASS:
3228               *err = build_equiv_class (sbcset,
3229 #ifdef RE_ENABLE_I18N
3230                                         mbcset, &equiv_class_alloc,
3231 #endif /* RE_ENABLE_I18N */
3232                                         start_elem.opr.name);
3233               if (BE (*err != REG_NOERROR, 0))
3234                 goto parse_bracket_exp_free_return;
3235               break;
3236             case COLL_SYM:
3237               *err = build_collating_symbol (sbcset,
3238 #ifdef RE_ENABLE_I18N
3239                                              mbcset, &coll_sym_alloc,
3240 #endif /* RE_ENABLE_I18N */
3241                                              start_elem.opr.name);
3242               if (BE (*err != REG_NOERROR, 0))
3243                 goto parse_bracket_exp_free_return;
3244               break;
3245             case CHAR_CLASS:
3246               *err = build_charclass (regexp->trans, sbcset,
3247 #ifdef RE_ENABLE_I18N
3248                                       mbcset, &char_class_alloc,
3249 #endif /* RE_ENABLE_I18N */
3250                                       (const char *) start_elem.opr.name, syntax);
3251               if (BE (*err != REG_NOERROR, 0))
3252                goto parse_bracket_exp_free_return;
3253               break;
3254             default:
3255               assert (0);
3256               break;
3257             }
3258         }
3259       if (BE (token->type == END_OF_RE, 0))
3260         {
3261           *err = REG_EBRACK;
3262           goto parse_bracket_exp_free_return;
3263         }
3264       if (token->type == OP_CLOSE_BRACKET)
3265         break;
3266     }
3267
3268   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3269
3270   /* If it is non-matching list.  */
3271   if (non_match)
3272     bitset_not (sbcset);
3273
3274 #ifdef RE_ENABLE_I18N
3275   /* Ensure only single byte characters are set.  */
3276   if (dfa->mb_cur_max > 1)
3277     bitset_mask (sbcset, dfa->sb_char);
3278
3279   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3280       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3281                                                      || mbcset->non_match)))
3282     {
3283       bin_tree_t *mbc_tree;
3284       int sbc_idx;
3285       /* Build a tree for complex bracket.  */
3286       dfa->has_mb_node = 1;
3287       br_token.type = COMPLEX_BRACKET;
3288       br_token.opr.mbcset = mbcset;
3289       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3290       if (BE (mbc_tree == NULL, 0))
3291         goto parse_bracket_exp_espace;
3292       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3293         if (sbcset[sbc_idx])
3294           break;
3295       /* If there are no bits set in sbcset, there is no point
3296          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3297       if (sbc_idx < BITSET_WORDS)
3298         {
3299           /* Build a tree for simple bracket.  */
3300           br_token.type = SIMPLE_BRACKET;
3301           br_token.opr.sbcset = sbcset;
3302           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3303           if (BE (work_tree == NULL, 0))
3304             goto parse_bracket_exp_espace;
3305
3306           /* Then join them by ALT node.  */
3307           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3308           if (BE (work_tree == NULL, 0))
3309             goto parse_bracket_exp_espace;
3310         }
3311       else
3312         {
3313           re_free (sbcset);
3314           work_tree = mbc_tree;
3315         }
3316     }
3317   else
3318 #endif /* not RE_ENABLE_I18N */
3319     {
3320 #ifdef RE_ENABLE_I18N
3321       free_charset (mbcset);
3322 #endif
3323       /* Build a tree for simple bracket.  */
3324       br_token.type = SIMPLE_BRACKET;
3325       br_token.opr.sbcset = sbcset;
3326       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3327       if (BE (work_tree == NULL, 0))
3328         goto parse_bracket_exp_espace;
3329     }
3330   return work_tree;
3331
3332  parse_bracket_exp_espace:
3333   *err = REG_ESPACE;
3334  parse_bracket_exp_free_return:
3335   re_free (sbcset);
3336 #ifdef RE_ENABLE_I18N
3337   free_charset (mbcset);
3338 #endif /* RE_ENABLE_I18N */
3339   return NULL;
3340 }
3341
3342 /* Parse an element in the bracket expression.  */
3343
3344 static reg_errcode_t
3345 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3346                        re_token_t *token, int token_len, re_dfa_t *dfa,
3347                        reg_syntax_t syntax, int accept_hyphen)
3348 {
3349 #ifdef RE_ENABLE_I18N
3350   int cur_char_size;
3351   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3352   if (cur_char_size > 1)
3353     {
3354       elem->type = MB_CHAR;
3355       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3356       re_string_skip_bytes (regexp, cur_char_size);
3357       return REG_NOERROR;
3358     }
3359 #endif /* RE_ENABLE_I18N */
3360   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3361   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3362       || token->type == OP_OPEN_EQUIV_CLASS)
3363     return parse_bracket_symbol (elem, regexp, token);
3364   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3365     {
3366       /* A '-' must only appear as anything but a range indicator before
3367          the closing bracket.  Everything else is an error.  */
3368       re_token_t token2;
3369       (void) peek_token_bracket (&token2, regexp, syntax);
3370       if (token2.type != OP_CLOSE_BRACKET)
3371         /* The actual error value is not standardized since this whole
3372            case is undefined.  But ERANGE makes good sense.  */
3373         return REG_ERANGE;
3374     }
3375   elem->type = SB_CHAR;
3376   elem->opr.ch = token->opr.c;
3377   return REG_NOERROR;
3378 }
3379
3380 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3381    such as [:<character_class>:], [.<collating_element>.], and
3382    [=<equivalent_class>=].  */
3383
3384 static reg_errcode_t
3385 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3386                       re_token_t *token)
3387 {
3388   unsigned char ch, delim = token->opr.c;
3389   int i = 0;
3390   if (re_string_eoi(regexp))
3391     return REG_EBRACK;
3392   for (;; ++i)
3393     {
3394       if (i >= BRACKET_NAME_BUF_SIZE)
3395         return REG_EBRACK;
3396       if (token->type == OP_OPEN_CHAR_CLASS)
3397         ch = re_string_fetch_byte_case (regexp);
3398       else
3399         ch = re_string_fetch_byte (regexp);
3400       if (re_string_eoi(regexp))
3401         return REG_EBRACK;
3402       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3403         break;
3404       elem->opr.name[i] = ch;
3405     }
3406   re_string_skip_bytes (regexp, 1);
3407   elem->opr.name[i] = '\0';
3408   switch (token->type)
3409     {
3410     case OP_OPEN_COLL_ELEM:
3411       elem->type = COLL_SYM;
3412       break;
3413     case OP_OPEN_EQUIV_CLASS:
3414       elem->type = EQUIV_CLASS;
3415       break;
3416     case OP_OPEN_CHAR_CLASS:
3417       elem->type = CHAR_CLASS;
3418       break;
3419     default:
3420       break;
3421     }
3422   return REG_NOERROR;
3423 }
3424
3425   /* Helper function for parse_bracket_exp.
3426      Build the equivalence class which is represented by NAME.
3427      The result are written to MBCSET and SBCSET.
3428      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3429      is a pointer argument since we may update it.  */
3430
3431 static reg_errcode_t
3432 #ifdef RE_ENABLE_I18N
3433 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3434                    int *equiv_class_alloc, const unsigned char *name)
3435 #else /* not RE_ENABLE_I18N */
3436 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3437 #endif /* not RE_ENABLE_I18N */
3438 {
3439 #ifdef _LIBC
3440   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3441   if (nrules != 0)
3442     {
3443       const int32_t *table, *indirect;
3444       const unsigned char *weights, *extra, *cp;
3445       unsigned char char_buf[2];
3446       int32_t idx1, idx2;
3447       unsigned int ch;
3448       size_t len;
3449       /* This #include defines a local function!  */
3450 # include <locale/weight.h>
3451       /* Calculate the index for equivalence class.  */
3452       cp = name;
3453       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3454       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3455                                                _NL_COLLATE_WEIGHTMB);
3456       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3457                                                    _NL_COLLATE_EXTRAMB);
3458       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3459                                                 _NL_COLLATE_INDIRECTMB);
3460       idx1 = findidx (&cp);
3461       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3462         /* This isn't a valid character.  */
3463         return REG_ECOLLATE;
3464
3465       /* Build single byte matching table for this equivalence class.  */
3466       char_buf[1] = (unsigned char) '\0';
3467       len = weights[idx1 & 0xffffff];
3468       for (ch = 0; ch < SBC_MAX; ++ch)
3469         {
3470           char_buf[0] = ch;
3471           cp = char_buf;
3472           idx2 = findidx (&cp);
3473 /*
3474           idx2 = table[ch];
3475 */
3476           if (idx2 == 0)
3477             /* This isn't a valid character.  */
3478             continue;
3479           /* Compare only if the length matches and the collation rule
3480              index is the same.  */
3481           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3482             {
3483               int cnt = 0;
3484
3485               while (cnt <= len &&
3486                      weights[(idx1 & 0xffffff) + 1 + cnt]
3487                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3488                 ++cnt;
3489
3490               if (cnt > len)
3491                 bitset_set (sbcset, ch);
3492             }
3493         }
3494       /* Check whether the array has enough space.  */
3495       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3496         {
3497           /* Not enough, realloc it.  */
3498           /* +1 in case of mbcset->nequiv_classes is 0.  */
3499           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3500           /* Use realloc since the array is NULL if *alloc == 0.  */
3501           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3502                                                    int32_t,
3503                                                    new_equiv_class_alloc);
3504           if (BE (new_equiv_classes == NULL, 0))
3505             return REG_ESPACE;
3506           mbcset->equiv_classes = new_equiv_classes;
3507           *equiv_class_alloc = new_equiv_class_alloc;
3508         }
3509       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3510     }
3511   else
3512 #endif /* _LIBC */
3513     {
3514       if (BE (strlen ((const char *) name) != 1, 0))
3515         return REG_ECOLLATE;
3516       bitset_set (sbcset, *name);
3517     }
3518   return REG_NOERROR;
3519 }
3520
3521   /* Helper function for parse_bracket_exp.
3522      Build the character class which is represented by NAME.
3523      The result are written to MBCSET and SBCSET.
3524      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3525      is a pointer argument since we may update it.  */
3526
3527 static reg_errcode_t
3528 #ifdef RE_ENABLE_I18N
3529 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3530                  re_charset_t *mbcset, int *char_class_alloc,
3531                  const char *class_name, reg_syntax_t syntax)
3532 #else /* not RE_ENABLE_I18N */
3533 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3534                  const char *class_name, reg_syntax_t syntax)
3535 #endif /* not RE_ENABLE_I18N */
3536 {
3537   int i;
3538
3539   /* In case of REG_ICASE "upper" and "lower" match the both of
3540      upper and lower cases.  */
3541   if ((syntax & RE_ICASE)
3542       && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3543     class_name = "alpha";
3544
3545 #ifdef RE_ENABLE_I18N
3546   /* Check the space of the arrays.  */
3547   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3548     {
3549       /* Not enough, realloc it.  */
3550       /* +1 in case of mbcset->nchar_classes is 0.  */
3551       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3552       /* Use realloc since array is NULL if *alloc == 0.  */
3553       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3554                                                new_char_class_alloc);
3555       if (BE (new_char_classes == NULL, 0))
3556         return REG_ESPACE;
3557       mbcset->char_classes = new_char_classes;
3558       *char_class_alloc = new_char_class_alloc;
3559     }
3560   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3561 #endif /* RE_ENABLE_I18N */
3562
3563 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3564   do {                                          \
3565     if (BE (trans != NULL, 0))                  \
3566       {                                         \
3567         for (i = 0; i < SBC_MAX; ++i)           \
3568           if (ctype_func (i))                   \
3569             bitset_set (sbcset, trans[i]);      \
3570       }                                         \
3571     else                                        \
3572       {                                         \
3573         for (i = 0; i < SBC_MAX; ++i)           \
3574           if (ctype_func (i))                   \
3575             bitset_set (sbcset, i);             \
3576       }                                         \
3577   } while (0)
3578
3579   if (strcmp (class_name, "alnum") == 0)
3580     BUILD_CHARCLASS_LOOP (isalnum);
3581   else if (strcmp (class_name, "cntrl") == 0)
3582     BUILD_CHARCLASS_LOOP (iscntrl);
3583   else if (strcmp (class_name, "lower") == 0)
3584     BUILD_CHARCLASS_LOOP (islower);
3585   else if (strcmp (class_name, "space") == 0)
3586     BUILD_CHARCLASS_LOOP (isspace);
3587   else if (strcmp (class_name, "alpha") == 0)
3588     BUILD_CHARCLASS_LOOP (isalpha);
3589   else if (strcmp (class_name, "digit") == 0)
3590     BUILD_CHARCLASS_LOOP (isdigit);
3591   else if (strcmp (class_name, "print") == 0)
3592     BUILD_CHARCLASS_LOOP (isprint);
3593   else if (strcmp (class_name, "upper") == 0)
3594     BUILD_CHARCLASS_LOOP (isupper);
3595   else if (strcmp (class_name, "blank") == 0)
3596 #ifndef GAWK
3597     BUILD_CHARCLASS_LOOP (isblank);
3598 #else
3599     /* see comments above */
3600     BUILD_CHARCLASS_LOOP (is_blank);
3601 #endif
3602   else if (strcmp (class_name, "graph") == 0)
3603     BUILD_CHARCLASS_LOOP (isgraph);
3604   else if (strcmp (class_name, "punct") == 0)
3605     BUILD_CHARCLASS_LOOP (ispunct);
3606   else if (strcmp (class_name, "xdigit") == 0)
3607     BUILD_CHARCLASS_LOOP (isxdigit);
3608   else
3609     return REG_ECTYPE;
3610
3611   return REG_NOERROR;
3612 }
3613
3614 static bin_tree_t *
3615 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3616                     const char *class_name,
3617                     const char *extra, int non_match,
3618                     reg_errcode_t *err)
3619 {
3620   re_bitset_ptr_t sbcset;
3621 #ifdef RE_ENABLE_I18N
3622   re_charset_t *mbcset;
3623   int alloc = 0;
3624 #endif /* not RE_ENABLE_I18N */
3625   reg_errcode_t ret;
3626   re_token_t br_token;
3627   bin_tree_t *tree;
3628
3629   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3630 #ifdef RE_ENABLE_I18N
3631   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3632 #endif /* RE_ENABLE_I18N */
3633
3634 #ifdef RE_ENABLE_I18N
3635   if (BE (sbcset == NULL || mbcset == NULL, 0))
3636 #else /* not RE_ENABLE_I18N */
3637   if (BE (sbcset == NULL, 0))
3638 #endif /* not RE_ENABLE_I18N */
3639     {
3640       *err = REG_ESPACE;
3641       return NULL;
3642     }
3643
3644   if (non_match)
3645     {
3646 #ifdef RE_ENABLE_I18N
3647       mbcset->non_match = 1;
3648 #endif /* not RE_ENABLE_I18N */
3649     }
3650
3651   /* We don't care the syntax in this case.  */
3652   ret = build_charclass (trans, sbcset,
3653 #ifdef RE_ENABLE_I18N
3654                          mbcset, &alloc,
3655 #endif /* RE_ENABLE_I18N */
3656                          class_name, 0);
3657
3658   if (BE (ret != REG_NOERROR, 0))
3659     {
3660       re_free (sbcset);
3661 #ifdef RE_ENABLE_I18N
3662       free_charset (mbcset);
3663 #endif /* RE_ENABLE_I18N */
3664       *err = ret;
3665       return NULL;
3666     }
3667   /* \w match '_' also.  */
3668   for (; *extra; extra++)
3669     bitset_set (sbcset, *extra);
3670
3671   /* If it is non-matching list.  */
3672   if (non_match)
3673     bitset_not (sbcset);
3674
3675 #ifdef RE_ENABLE_I18N
3676   /* Ensure only single byte characters are set.  */
3677   if (dfa->mb_cur_max > 1)
3678     bitset_mask (sbcset, dfa->sb_char);
3679 #endif
3680
3681   /* Build a tree for simple bracket.  */
3682   br_token.type = SIMPLE_BRACKET;
3683   br_token.opr.sbcset = sbcset;
3684   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3685   if (BE (tree == NULL, 0))
3686     goto build_word_op_espace;
3687
3688 #ifdef RE_ENABLE_I18N
3689   if (dfa->mb_cur_max > 1)
3690     {
3691       bin_tree_t *mbc_tree;
3692       /* Build a tree for complex bracket.  */
3693       br_token.type = COMPLEX_BRACKET;
3694       br_token.opr.mbcset = mbcset;
3695       dfa->has_mb_node = 1;
3696       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3697       if (BE (mbc_tree == NULL, 0))
3698         goto build_word_op_espace;
3699       /* Then join them by ALT node.  */
3700       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3701       if (BE (mbc_tree != NULL, 1))
3702         return tree;
3703     }
3704   else
3705     {
3706       free_charset (mbcset);
3707       return tree;
3708     }
3709 #else /* not RE_ENABLE_I18N */
3710   return tree;
3711 #endif /* not RE_ENABLE_I18N */
3712
3713  build_word_op_espace:
3714   re_free (sbcset);
3715 #ifdef RE_ENABLE_I18N
3716   free_charset (mbcset);
3717 #endif /* RE_ENABLE_I18N */
3718   *err = REG_ESPACE;
3719   return NULL;
3720 }
3721
3722 /* This is intended for the expressions like "a{1,3}".
3723    Fetch a number from `input', and return the number.
3724    Return -1, if the number field is empty like "{,1}".
3725    Return -2, if an error has occurred.  */
3726
3727 static int
3728 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3729 {
3730   int num = -1;
3731   unsigned char c;
3732   while (1)
3733     {
3734       fetch_token (token, input, syntax);
3735       c = token->opr.c;
3736       if (BE (token->type == END_OF_RE, 0))
3737         return -2;
3738       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3739         break;
3740       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3741              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3742       num = (num > RE_DUP_MAX) ? -2 : num;
3743     }
3744   return num;
3745 }
3746 \f
3747 #ifdef RE_ENABLE_I18N
3748 static void
3749 free_charset (re_charset_t *cset)
3750 {
3751   re_free (cset->mbchars);
3752 # ifdef _LIBC
3753   re_free (cset->coll_syms);
3754   re_free (cset->equiv_classes);
3755   re_free (cset->range_starts);
3756   re_free (cset->range_ends);
3757 # endif
3758   re_free (cset->char_classes);
3759   re_free (cset);
3760 }
3761 #endif /* RE_ENABLE_I18N */
3762 \f
3763 /* Functions for binary tree operation.  */
3764
3765 /* Create a tree node.  */
3766
3767 static bin_tree_t *
3768 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3769              re_token_type_t type)
3770 {
3771   re_token_t t;
3772   t.type = type;
3773   return create_token_tree (dfa, left, right, &t);
3774 }
3775
3776 static bin_tree_t *
3777 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3778                    const re_token_t *token)
3779 {
3780   bin_tree_t *tree;
3781   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3782     {
3783       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3784
3785       if (storage == NULL)
3786         return NULL;
3787       storage->next = dfa->str_tree_storage;
3788       dfa->str_tree_storage = storage;
3789       dfa->str_tree_storage_idx = 0;
3790     }
3791   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3792
3793   tree->parent = NULL;
3794   tree->left = left;
3795   tree->right = right;
3796   tree->token = *token;
3797   tree->token.duplicated = 0;
3798   tree->token.opt_subexp = 0;
3799   tree->first = NULL;
3800   tree->next = NULL;
3801   tree->node_idx = -1;
3802
3803   if (left != NULL)
3804     left->parent = tree;
3805   if (right != NULL)
3806     right->parent = tree;
3807   return tree;
3808 }
3809
3810 /* Mark the tree SRC as an optional subexpression.
3811    To be called from preorder or postorder.  */
3812
3813 static reg_errcode_t
3814 mark_opt_subexp (void *extra, bin_tree_t *node)
3815 {
3816   int idx = (int) (intptr_t) extra;
3817   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3818     node->token.opt_subexp = 1;
3819
3820   return REG_NOERROR;
3821 }
3822
3823 /* Free the allocated memory inside NODE. */
3824
3825 static void
3826 free_token (re_token_t *node)
3827 {
3828 #ifdef RE_ENABLE_I18N
3829   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3830     free_charset (node->opr.mbcset);
3831   else
3832 #endif /* RE_ENABLE_I18N */
3833     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3834       re_free (node->opr.sbcset);
3835 }
3836
3837 /* Worker function for tree walking.  Free the allocated memory inside NODE
3838    and its children. */
3839
3840 static reg_errcode_t
3841 free_tree (void *extra, bin_tree_t *node)
3842 {
3843   free_token (&node->token);
3844   return REG_NOERROR;
3845 }
3846
3847
3848 /* Duplicate the node SRC, and return new node.  This is a preorder
3849    visit similar to the one implemented by the generic visitor, but
3850    we need more infrastructure to maintain two parallel trees --- so,
3851    it's easier to duplicate.  */
3852
3853 static bin_tree_t *
3854 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3855 {
3856   const bin_tree_t *node;
3857   bin_tree_t *dup_root;
3858   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3859
3860   for (node = root; ; )
3861     {
3862       /* Create a new tree and link it back to the current parent.  */
3863       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3864       if (*p_new == NULL)
3865         return NULL;
3866       (*p_new)->parent = dup_node;
3867       (*p_new)->token.duplicated = 1;
3868       dup_node = *p_new;
3869
3870       /* Go to the left node, or up and to the right.  */
3871       if (node->left)
3872         {
3873           node = node->left;
3874           p_new = &dup_node->left;
3875         }
3876       else
3877         {
3878           const bin_tree_t *prev = NULL;
3879           while (node->right == prev || node->right == NULL)
3880             {
3881               prev = node;
3882               node = node->parent;
3883               dup_node = dup_node->parent;
3884               if (!node)
3885                 return dup_root;
3886             }
3887           node = node->right;
3888           p_new = &dup_node->right;
3889         }
3890     }
3891 }