2 * Copyright 2008 Jacek Caban for CodeWeavers
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
20 * Code in this file is based on files:
23 * from Mozilla project, released under LGPL 2.1 or later.
25 * The Original Code is Mozilla Communicator client code, released
28 * The Initial Developer of the Original Code is
29 * Netscape Communications Corporation.
30 * Portions created by the Initial Developer are Copyright (C) 1998
31 * the Initial Developer. All Rights Reserved.
39 #include "wine/debug.h"
41 WINE_DEFAULT_DEBUG_CHANNEL(jscript);
43 /* FIXME: Better error handling */
44 #define ReportRegExpError(a,b,c)
45 #define ReportRegExpErrorHelper(a,b,c,d)
46 #define JS_ReportErrorNumber(a,b,c,d)
47 #define JS_ReportErrorFlagsAndNumber(a,b,c,d,e,f)
48 #define js_ReportOutOfScriptQuota(a)
49 #define JS_ReportOutOfMemory(a)
50 #define JS_COUNT_OPERATION(a,b)
53 typedef BYTE JSPackedBool;
56 * This struct holds a bitmap representation of a class from a regexp.
57 * There's a list of these referenced by the classList field in the regexp_t
58 * struct below. The initial state has startIndex set to the offset in the
59 * original regexp source of the beginning of the class contents. The first
60 * use of the class converts the source representation into a bitmap.
63 typedef struct RECharSet {
64 JSPackedBool converted;
76 #define JSMSG_MIN_TOO_BIG 47
77 #define JSMSG_MAX_TOO_BIG 48
78 #define JSMSG_OUT_OF_ORDER 49
79 #define JSMSG_OUT_OF_MEMORY 137
81 #define LINE_SEPARATOR 0x2028
82 #define PARA_SEPARATOR 0x2029
84 #define RE_IS_LETTER(c) (((c >= 'A') && (c <= 'Z')) || \
85 ((c >= 'a') && (c <= 'z')) )
86 #define RE_IS_LINE_TERM(c) ((c == '\n') || (c == '\r') || \
87 (c == LINE_SEPARATOR) || (c == PARA_SEPARATOR))
89 #define JS_ISWORD(c) ((c) < 128 && (isalnum(c) || (c) == '_'))
91 #define JS7_ISDEC(c) ((((unsigned)(c)) - '0') <= 9)
92 #define JS7_UNDEC(c) ((c) - '0')
144 REOP_LIMIT /* META: no operator >= to this */
147 #define REOP_IS_SIMPLE(op) ((op) <= REOP_NCLASS)
149 static const char *reop_names[] = {
202 typedef struct REProgState {
203 jsbytecode *continue_pc; /* current continuation data */
204 jsbytecode continue_op;
205 ptrdiff_t index; /* progress in text */
206 size_t parenSoFar; /* highest indexed paren started */
209 UINT min; /* current quantifier limits */
213 size_t top; /* backtrack stack state */
219 typedef struct REBackTrackData {
220 size_t sz; /* size of previous stack entry */
221 jsbytecode *backtrack_pc; /* where to backtrack to */
222 jsbytecode backtrack_op;
223 const WCHAR *cp; /* index in text of match at backtrack */
224 size_t parenIndex; /* start index of saved paren contents */
225 size_t parenCount; /* # of saved paren contents */
226 size_t saveStateStackTop; /* number of parent states */
227 /* saved parent states follow */
228 /* saved paren contents follow */
231 #define INITIAL_STATESTACK 100
232 #define INITIAL_BACKTRACK 8000
234 typedef struct REGlobalData {
236 regexp_t *regexp; /* the RE in execution */
237 BOOL ok; /* runtime error (out_of_memory only?) */
238 size_t start; /* offset to start at */
239 ptrdiff_t skipped; /* chars skipped anchoring this r.e. */
240 const WCHAR *cpbegin; /* text base address */
241 const WCHAR *cpend; /* text limit address */
243 REProgState *stateStack; /* stack of state of current parents */
244 size_t stateStackTop;
245 size_t stateStackLimit;
247 REBackTrackData *backTrackStack;/* stack of matched-so-far positions */
248 REBackTrackData *backTrackSP;
249 size_t backTrackStackSize;
250 size_t cursz; /* size of current stack entry */
251 size_t backTrackCount; /* how many times we've backtracked */
252 size_t backTrackLimit; /* upper limit on backtrack states */
254 heap_pool_t *pool; /* It's faster to use one malloc'd pool
255 than to malloc/free the three items
256 that are allocated from this pool */
259 typedef struct RENode RENode;
261 REOp op; /* r.e. op bytecode */
262 RENode *next; /* next in concatenation order */
263 void *kid; /* first operand */
265 void *kid2; /* second operand */
266 INT num; /* could be a number */
267 size_t parenIndex; /* or a parenthesis index */
268 struct { /* or a quantifier range */
273 struct { /* or a character class */
275 size_t kidlen; /* length of string at kid, in jschars */
276 size_t index; /* index into class list */
277 WORD bmsize; /* bitmap size, based on max char code */
280 struct { /* or a literal sequence */
281 WCHAR chr; /* of one character */
282 size_t length; /* or many (via the kid) */
285 RENode *kid2; /* second operand from ALT */
286 WCHAR ch1; /* match char for ALTPREREQ */
287 WCHAR ch2; /* ditto, or class index for ALTPREREQ2 */
292 #define CLASS_CACHE_SIZE 4
294 typedef struct CompilerState {
296 const WCHAR *cpbegin;
300 size_t classCount; /* number of [] encountered */
301 size_t treeDepth; /* maximum depth of parse tree */
302 size_t progLength; /* estimated bytecode length */
304 size_t classBitmapsMem; /* memory to hold all class bitmaps */
306 const WCHAR *start; /* small cache of class strings */
307 size_t length; /* since they're often the same */
309 } classCache[CLASS_CACHE_SIZE];
312 heap_pool_t *pool; /* It's faster to use one malloc'd pool
313 than to malloc/free */
316 typedef struct EmitStateStackEntry {
317 jsbytecode *altHead; /* start of REOP_ALT* opcode */
318 jsbytecode *nextAltFixup; /* fixup pointer to next-alt offset */
319 jsbytecode *nextTermFixup; /* fixup ptr. to REOP_JUMP offset */
320 jsbytecode *endTermFixup; /* fixup ptr. to REOPT_ALTPREREQ* offset */
321 RENode *continueNode; /* original REOP_ALT* node being stacked */
322 jsbytecode continueOp; /* REOP_JUMP or REOP_ENDALT continuation */
323 JSPackedBool jumpToJumpFlag; /* true if we've patched jump-to-jump to
324 avoid 16-bit unsigned offset overflow */
325 } EmitStateStackEntry;
328 * Immediate operand sizes and getter/setters. Unlike the ones in jsopcode.h,
329 * the getters and setters take the pc of the offset, not of the opcode before
333 #define GET_ARG(pc) ((WORD)(((pc)[0] << 8) | (pc)[1]))
334 #define SET_ARG(pc, arg) ((pc)[0] = (jsbytecode) ((arg) >> 8), \
335 (pc)[1] = (jsbytecode) (arg))
337 #define OFFSET_LEN ARG_LEN
338 #define OFFSET_MAX ((1 << (ARG_LEN * 8)) - 1)
339 #define GET_OFFSET(pc) GET_ARG(pc)
341 static BOOL ParseRegExp(CompilerState*);
344 * Maximum supported tree depth is maximum size of EmitStateStackEntry stack.
345 * For sanity, we limit it to 2^24 bytes.
347 #define TREE_DEPTH_MAX ((1 << 24) / sizeof(EmitStateStackEntry))
350 * The maximum memory that can be allocated for class bitmaps.
351 * For sanity, we limit it to 2^24 bytes.
353 #define CLASS_BITMAPS_MEM_LIMIT (1 << 24)
356 * Functions to get size and write/read bytecode that represent small indexes
358 * Each byte in the code represent 7-bit chunk of the index. 8th bit when set
359 * indicates that the following byte brings more bits to the index. Otherwise
360 * this is the last byte in the index bytecode representing highest index bits.
363 GetCompactIndexWidth(size_t index)
367 for (width = 1; (index >>= 7) != 0; ++width) { }
371 static inline jsbytecode *
372 WriteCompactIndex(jsbytecode *pc, size_t index)
376 while ((next = index >> 7) != 0) {
377 *pc++ = (jsbytecode)(index | 0x80);
380 *pc++ = (jsbytecode)index;
384 static inline jsbytecode *
385 ReadCompactIndex(jsbytecode *pc, size_t *result)
390 if ((nextByte & 0x80) == 0) {
392 * Short-circuit the most common case when compact index <= 127.
397 *result = 0x7F & nextByte;
400 *result |= (nextByte & 0x7F) << shift;
402 } while ((nextByte & 0x80) != 0);
407 /* Construct and initialize an RENode, returning NULL for out-of-memory */
409 NewRENode(CompilerState *state, REOp op)
413 ren = heap_pool_alloc(state->pool, sizeof(*ren));
415 /* js_ReportOutOfScriptQuota(cx); */
425 * Validates and converts hex ascii value.
428 isASCIIHexDigit(WCHAR c, UINT *digit)
439 if (cv >= 'a' && cv <= 'f') {
440 *digit = cv - 'a' + 10;
452 #define JUMP_OFFSET_HI(off) ((jsbytecode)((off) >> 8))
453 #define JUMP_OFFSET_LO(off) ((jsbytecode)(off))
456 SetForwardJumpOffset(jsbytecode *jump, jsbytecode *target)
458 ptrdiff_t offset = target - jump;
460 /* Check that target really points forward. */
462 if ((size_t)offset > OFFSET_MAX)
465 jump[0] = JUMP_OFFSET_HI(offset);
466 jump[1] = JUMP_OFFSET_LO(offset);
471 * Generate bytecode for the tree rooted at t using an explicit stack instead
475 EmitREBytecode(CompilerState *state, regexp_t *re, size_t treeDepth,
476 jsbytecode *pc, RENode *t)
478 EmitStateStackEntry *emitStateSP, *emitStateStack;
482 if (treeDepth == 0) {
483 emitStateStack = NULL;
485 emitStateStack = heap_alloc(sizeof(EmitStateStackEntry) * treeDepth);
489 emitStateSP = emitStateStack;
491 assert(op < REOP_LIMIT);
500 case REOP_ALTPREREQ2:
503 emitStateSP->altHead = pc - 1;
504 emitStateSP->endTermFixup = pc;
506 SET_ARG(pc, t->u.altprereq.ch1);
508 SET_ARG(pc, t->u.altprereq.ch2);
511 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
514 emitStateSP->continueNode = t;
515 emitStateSP->continueOp = REOP_JUMP;
516 emitStateSP->jumpToJumpFlag = FALSE;
518 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
521 assert(op < REOP_LIMIT);
525 emitStateSP->nextTermFixup = pc; /* offset to following term */
527 if (!SetForwardJumpOffset(emitStateSP->nextAltFixup, pc))
529 emitStateSP->continueOp = REOP_ENDALT;
531 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
534 assert(op < REOP_LIMIT);
539 * If we already patched emitStateSP->nextTermFixup to jump to
540 * a nearer jump, to avoid 16-bit immediate offset overflow, we
543 if (emitStateSP->jumpToJumpFlag)
547 * Fix up the REOP_JUMP offset to go to the op after REOP_ENDALT.
548 * REOP_ENDALT is executed only on successful match of the last
549 * alternate in a group.
551 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
553 if (t->op != REOP_ALT) {
554 if (!SetForwardJumpOffset(emitStateSP->endTermFixup, pc))
559 * If the program is bigger than the REOP_JUMP offset range, then
560 * we must check for alternates before this one that are part of
561 * the same group, and fix up their jump offsets to target jumps
562 * close enough to fit in a 16-bit unsigned offset immediate.
564 if ((size_t)(pc - re->program) > OFFSET_MAX &&
565 emitStateSP > emitStateStack) {
566 EmitStateStackEntry *esp, *esp2;
567 jsbytecode *alt, *jump;
568 ptrdiff_t span, header;
572 for (esp = esp2 - 1; esp >= emitStateStack; --esp) {
573 if (esp->continueOp == REOP_ENDALT &&
574 !esp->jumpToJumpFlag &&
575 esp->nextTermFixup + OFFSET_LEN == alt &&
576 (size_t)(pc - ((esp->continueNode->op != REOP_ALT)
578 : esp->nextTermFixup)) > OFFSET_MAX) {
580 jump = esp->nextTermFixup;
583 * The span must be 1 less than the distance from
584 * jump offset to jump offset, so we actually jump
585 * to a REOP_JUMP bytecode, not to its offset!
588 assert(jump < esp2->nextTermFixup);
589 span = esp2->nextTermFixup - jump - 1;
590 if ((size_t)span <= OFFSET_MAX)
595 } while (esp2->continueOp != REOP_ENDALT);
598 jump[0] = JUMP_OFFSET_HI(span);
599 jump[1] = JUMP_OFFSET_LO(span);
601 if (esp->continueNode->op != REOP_ALT) {
603 * We must patch the offset at esp->endTermFixup
604 * as well, for the REOP_ALTPREREQ{,2} opcodes.
605 * If we're unlucky and endTermFixup is more than
606 * OFFSET_MAX bytes from its target, we cheat by
607 * jumping 6 bytes to the jump whose offset is at
608 * esp->nextTermFixup, which has the same target.
610 jump = esp->endTermFixup;
611 header = esp->nextTermFixup - jump;
613 if ((size_t)span > OFFSET_MAX)
616 jump[0] = JUMP_OFFSET_HI(span);
617 jump[1] = JUMP_OFFSET_LO(span);
620 esp->jumpToJumpFlag = TRUE;
628 emitStateSP->altHead = pc - 1;
629 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
631 emitStateSP->continueNode = t;
632 emitStateSP->continueOp = REOP_JUMP;
633 emitStateSP->jumpToJumpFlag = FALSE;
635 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
638 assert(op < REOP_LIMIT);
643 * Coalesce FLATs if possible and if it would not increase bytecode
644 * beyond preallocated limit. The latter happens only when bytecode
645 * size for coalesced string with offset p and length 2 exceeds 6
646 * bytes preallocated for 2 single char nodes, i.e. when
647 * 1 + GetCompactIndexWidth(p) + GetCompactIndexWidth(2) > 6 or
648 * GetCompactIndexWidth(p) > 4.
649 * Since when GetCompactIndexWidth(p) <= 4 coalescing of 3 or more
650 * nodes strictly decreases bytecode size, the check has to be
651 * done only for the first coalescing.
654 GetCompactIndexWidth((WCHAR*)t->kid - state->cpbegin) <= 4)
657 t->next->op == REOP_FLAT &&
658 (WCHAR*)t->kid + t->u.flat.length ==
660 t->u.flat.length += t->next->u.flat.length;
661 t->next = t->next->next;
664 if (t->kid && t->u.flat.length > 1) {
665 pc[-1] = (state->flags & REG_FOLD) ? REOP_FLATi : REOP_FLAT;
666 pc = WriteCompactIndex(pc, (WCHAR*)t->kid - state->cpbegin);
667 pc = WriteCompactIndex(pc, t->u.flat.length);
668 } else if (t->u.flat.chr < 256) {
669 pc[-1] = (state->flags & REG_FOLD) ? REOP_FLAT1i : REOP_FLAT1;
670 *pc++ = (jsbytecode) t->u.flat.chr;
672 pc[-1] = (state->flags & REG_FOLD)
675 SET_ARG(pc, t->u.flat.chr);
682 pc = WriteCompactIndex(pc, t->u.parenIndex);
683 emitStateSP->continueNode = t;
684 emitStateSP->continueOp = REOP_RPAREN;
686 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
692 pc = WriteCompactIndex(pc, t->u.parenIndex);
696 pc = WriteCompactIndex(pc, t->u.parenIndex);
701 emitStateSP->nextTermFixup = pc;
703 emitStateSP->continueNode = t;
704 emitStateSP->continueOp = REOP_ASSERTTEST;
706 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
711 case REOP_ASSERTTEST:
712 case REOP_ASSERTNOTTEST:
713 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
717 case REOP_ASSERT_NOT:
719 emitStateSP->nextTermFixup = pc;
721 emitStateSP->continueNode = t;
722 emitStateSP->continueOp = REOP_ASSERTNOTTEST;
724 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
731 if (t->u.range.min == 0 && t->u.range.max == (UINT)-1) {
732 pc[-1] = (t->u.range.greedy) ? REOP_STAR : REOP_MINIMALSTAR;
733 } else if (t->u.range.min == 0 && t->u.range.max == 1) {
734 pc[-1] = (t->u.range.greedy) ? REOP_OPT : REOP_MINIMALOPT;
735 } else if (t->u.range.min == 1 && t->u.range.max == (UINT) -1) {
736 pc[-1] = (t->u.range.greedy) ? REOP_PLUS : REOP_MINIMALPLUS;
738 if (!t->u.range.greedy)
739 pc[-1] = REOP_MINIMALQUANT;
740 pc = WriteCompactIndex(pc, t->u.range.min);
742 * Write max + 1 to avoid using size_t(max) + 1 bytes
743 * for (UINT)-1 sentinel.
745 pc = WriteCompactIndex(pc, t->u.range.max + 1);
747 emitStateSP->nextTermFixup = pc;
749 emitStateSP->continueNode = t;
750 emitStateSP->continueOp = REOP_ENDCHILD;
752 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
758 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
763 if (!t->u.ucclass.sense)
764 pc[-1] = REOP_NCLASS;
765 pc = WriteCompactIndex(pc, t->u.ucclass.index);
766 charSet = &re->classList[t->u.ucclass.index];
767 charSet->converted = FALSE;
768 charSet->length = t->u.ucclass.bmsize;
769 charSet->u.src.startIndex = t->u.ucclass.startIndex;
770 charSet->u.src.length = t->u.ucclass.kidlen;
771 charSet->sense = t->u.ucclass.sense;
782 if (emitStateSP == emitStateStack)
785 t = emitStateSP->continueNode;
786 op = (REOp) emitStateSP->continueOp;
791 heap_free(emitStateStack);
795 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
801 * Process the op against the two top operands, reducing them to a single
802 * operand in the penultimate slot. Update progLength and treeDepth.
805 ProcessOp(CompilerState *state, REOpData *opData, RENode **operandStack,
810 switch (opData->op) {
812 result = NewRENode(state, REOP_ALT);
815 result->kid = operandStack[operandSP - 2];
816 result->u.kid2 = operandStack[operandSP - 1];
817 operandStack[operandSP - 2] = result;
819 if (state->treeDepth == TREE_DEPTH_MAX) {
820 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
826 * Look at both alternates to see if there's a FLAT or a CLASS at
827 * the start of each. If so, use a prerequisite match.
829 if (((RENode *) result->kid)->op == REOP_FLAT &&
830 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
831 (state->flags & REG_FOLD) == 0) {
832 result->op = REOP_ALTPREREQ;
833 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
834 result->u.altprereq.ch2 = ((RENode *) result->u.kid2)->u.flat.chr;
835 /* ALTPREREQ, <end>, uch1, uch2, <next>, ...,
836 JUMP, <end> ... ENDALT */
837 state->progLength += 13;
840 if (((RENode *) result->kid)->op == REOP_CLASS &&
841 ((RENode *) result->kid)->u.ucclass.index < 256 &&
842 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
843 (state->flags & REG_FOLD) == 0) {
844 result->op = REOP_ALTPREREQ2;
845 result->u.altprereq.ch1 = ((RENode *) result->u.kid2)->u.flat.chr;
846 result->u.altprereq.ch2 = ((RENode *) result->kid)->u.ucclass.index;
847 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
848 JUMP, <end> ... ENDALT */
849 state->progLength += 13;
852 if (((RENode *) result->kid)->op == REOP_FLAT &&
853 ((RENode *) result->u.kid2)->op == REOP_CLASS &&
854 ((RENode *) result->u.kid2)->u.ucclass.index < 256 &&
855 (state->flags & REG_FOLD) == 0) {
856 result->op = REOP_ALTPREREQ2;
857 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
858 result->u.altprereq.ch2 =
859 ((RENode *) result->u.kid2)->u.ucclass.index;
860 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
861 JUMP, <end> ... ENDALT */
862 state->progLength += 13;
865 /* ALT, <next>, ..., JUMP, <end> ... ENDALT */
866 state->progLength += 7;
871 result = operandStack[operandSP - 2];
873 result = result->next;
874 result->next = operandStack[operandSP - 1];
878 case REOP_ASSERT_NOT:
881 /* These should have been processed by a close paren. */
882 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_MISSING_PAREN,
892 * Hack two bits in CompilerState.flags, for use within FindParenCount to flag
893 * its being on the stack, and to propagate errors to its callers.
895 #define JSREG_FIND_PAREN_COUNT 0x8000
896 #define JSREG_FIND_PAREN_ERROR 0x4000
899 * Magic return value from FindParenCount and GetDecimalValue, to indicate
900 * overflow beyond GetDecimalValue's max parameter, or a computed maximum if
901 * its findMax parameter is non-null.
903 #define OVERFLOW_VALUE ((UINT)-1)
906 FindParenCount(CompilerState *state)
911 if (state->flags & JSREG_FIND_PAREN_COUNT)
912 return OVERFLOW_VALUE;
915 * Copy state into temp, flag it so we never report an invalid backref,
916 * and reset its members to parse the entire regexp. This is obviously
917 * suboptimal, but GetDecimalValue calls us only if a backref appears to
918 * refer to a forward parenthetical, which is rare.
921 temp.flags |= JSREG_FIND_PAREN_COUNT;
922 temp.cp = temp.cpbegin;
927 temp.classBitmapsMem = 0;
928 for (i = 0; i < CLASS_CACHE_SIZE; i++)
929 temp.classCache[i].start = NULL;
931 if (!ParseRegExp(&temp)) {
932 state->flags |= JSREG_FIND_PAREN_ERROR;
933 return OVERFLOW_VALUE;
935 return temp.parenCount;
939 * Extract and return a decimal value at state->cp. The initial character c
940 * has already been read. Return OVERFLOW_VALUE if the result exceeds max.
941 * Callers who pass a non-null findMax should test JSREG_FIND_PAREN_ERROR in
942 * state->flags to discover whether an error occurred under findMax.
945 GetDecimalValue(WCHAR c, UINT max, UINT (*findMax)(CompilerState *state),
946 CompilerState *state)
948 UINT value = JS7_UNDEC(c);
949 BOOL overflow = (value > max && (!findMax || value > findMax(state)));
951 /* The following restriction allows simpler overflow checks. */
952 assert(max <= ((UINT)-1 - 9) / 10);
953 while (state->cp < state->cpend) {
957 value = 10 * value + JS7_UNDEC(c);
958 if (!overflow && value > max && (!findMax || value > findMax(state)))
962 return overflow ? OVERFLOW_VALUE : value;
966 * Calculate the total size of the bitmap required for a class expression.
969 CalculateBitmapSize(CompilerState *state, RENode *target, const WCHAR *src,
973 BOOL inRange = FALSE;
974 WCHAR c, rangeStart = 0;
975 UINT n, digit, nDigits, i;
977 target->u.ucclass.bmsize = 0;
978 target->u.ucclass.sense = TRUE;
985 target->u.ucclass.sense = FALSE;
989 BOOL canStartRange = TRUE;
1016 if (src < end && RE_IS_LETTER(*src)) {
1017 localMax = (UINT) (*src++) & 0x1F;
1030 for (i = 0; (i < nDigits) && (src < end); i++) {
1032 if (!isASCIIHexDigit(c, &digit)) {
1034 * Back off to accepting the original
1041 n = (n << 4) | digit;
1046 canStartRange = FALSE;
1048 JS_ReportErrorNumber(state->context,
1049 js_GetErrorMessage, NULL,
1050 JSMSG_BAD_CLASS_RANGE);
1060 canStartRange = FALSE;
1062 JS_ReportErrorNumber(state->context,
1063 js_GetErrorMessage, NULL,
1064 JSMSG_BAD_CLASS_RANGE);
1070 * If this is the start of a range, ensure that it's less than
1084 * This is a non-ECMA extension - decimal escapes (in this
1085 * case, octal!) are supposed to be an error inside class
1086 * ranges, but supported here for backwards compatibility.
1091 if ('0' <= c && c <= '7') {
1093 n = 8 * n + JS7_UNDEC(c);
1095 if ('0' <= c && c <= '7') {
1097 i = 8 * n + JS7_UNDEC(c);
1118 /* Throw a SyntaxError here, per ECMA-262, 15.10.2.15. */
1119 if (rangeStart > localMax) {
1120 JS_ReportErrorNumber(state->context,
1121 js_GetErrorMessage, NULL,
1122 JSMSG_BAD_CLASS_RANGE);
1127 if (canStartRange && src < end - 1) {
1131 rangeStart = (WCHAR)localMax;
1135 if (state->flags & REG_FOLD)
1136 rangeStart = localMax; /* one run of the uc/dc loop below */
1139 if (state->flags & REG_FOLD) {
1140 WCHAR maxch = localMax;
1142 for (i = rangeStart; i <= localMax; i++) {
1158 target->u.ucclass.bmsize = max;
1163 ParseMinMaxQuantifier(CompilerState *state, BOOL ignoreValues)
1167 const WCHAR *errp = state->cp++;
1172 min = GetDecimalValue(c, 0xFFFF, NULL, state);
1175 if (!ignoreValues && min == OVERFLOW_VALUE)
1176 return JSMSG_MIN_TOO_BIG;
1182 max = GetDecimalValue(c, 0xFFFF, NULL, state);
1184 if (!ignoreValues && max == OVERFLOW_VALUE)
1185 return JSMSG_MAX_TOO_BIG;
1186 if (!ignoreValues && min > max)
1187 return JSMSG_OUT_OF_ORDER;
1195 state->result = NewRENode(state, REOP_QUANT);
1197 return JSMSG_OUT_OF_MEMORY;
1198 state->result->u.range.min = min;
1199 state->result->u.range.max = max;
1201 * QUANT, <min>, <max>, <next> ... <ENDCHILD>
1202 * where <max> is written as compact(max+1) to make
1203 * (UINT)-1 sentinel to occupy 1 byte, not width_of(max)+1.
1205 state->progLength += (1 + GetCompactIndexWidth(min)
1206 + GetCompactIndexWidth(max + 1)
1217 ParseQuantifier(CompilerState *state)
1220 term = state->result;
1221 if (state->cp < state->cpend) {
1222 switch (*state->cp) {
1224 state->result = NewRENode(state, REOP_QUANT);
1227 state->result->u.range.min = 1;
1228 state->result->u.range.max = (UINT)-1;
1229 /* <PLUS>, <next> ... <ENDCHILD> */
1230 state->progLength += 4;
1233 state->result = NewRENode(state, REOP_QUANT);
1236 state->result->u.range.min = 0;
1237 state->result->u.range.max = (UINT)-1;
1238 /* <STAR>, <next> ... <ENDCHILD> */
1239 state->progLength += 4;
1242 state->result = NewRENode(state, REOP_QUANT);
1245 state->result->u.range.min = 0;
1246 state->result->u.range.max = 1;
1247 /* <OPT>, <next> ... <ENDCHILD> */
1248 state->progLength += 4;
1250 case '{': /* balance '}' */
1254 err = ParseMinMaxQuantifier(state, FALSE);
1260 ReportRegExpErrorHelper(state, JSREPORT_ERROR, err, errp);
1269 if (state->treeDepth == TREE_DEPTH_MAX) {
1270 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1276 state->result->kid = term;
1277 if (state->cp < state->cpend && *state->cp == '?') {
1279 state->result->u.range.greedy = FALSE;
1281 state->result->u.range.greedy = TRUE;
1287 * item: assertion An item is either an assertion or
1288 * quantatom a quantified atom.
1290 * assertion: '^' Assertions match beginning of string
1291 * (or line if the class static property
1292 * RegExp.multiline is true).
1293 * '$' End of string (or line if the class
1294 * static property RegExp.multiline is
1296 * '\b' Word boundary (between \w and \W).
1297 * '\B' Word non-boundary.
1299 * quantatom: atom An unquantified atom.
1300 * quantatom '{' n ',' m '}'
1301 * Atom must occur between n and m times.
1302 * quantatom '{' n ',' '}' Atom must occur at least n times.
1303 * quantatom '{' n '}' Atom must occur exactly n times.
1304 * quantatom '*' Zero or more times (same as {0,}).
1305 * quantatom '+' One or more times (same as {1,}).
1306 * quantatom '?' Zero or one time (same as {0,1}).
1308 * any of which can be optionally followed by '?' for ungreedy
1310 * atom: '(' regexp ')' A parenthesized regexp (what matched
1311 * can be addressed using a backreference,
1313 * '.' Matches any char except '\n'.
1314 * '[' classlist ']' A character class.
1315 * '[' '^' classlist ']' A negated character class.
1317 * '\n' Newline (Line Feed).
1318 * '\r' Carriage Return.
1319 * '\t' Horizontal Tab.
1320 * '\v' Vertical Tab.
1321 * '\d' A digit (same as [0-9]).
1323 * '\w' A word character, [0-9a-z_A-Z].
1324 * '\W' A non-word character.
1325 * '\s' A whitespace character, [ \b\f\n\r\t\v].
1326 * '\S' A non-whitespace character.
1327 * '\' n A backreference to the nth (n decimal
1328 * and positive) parenthesized expression.
1329 * '\' octal An octal escape sequence (octal must be
1330 * two or three digits long, unless it is
1331 * 0 for the null character).
1332 * '\x' hex A hex escape (hex must be two digits).
1333 * '\u' unicode A unicode escape (must be four digits).
1334 * '\c' ctrl A control character, ctrl is a letter.
1335 * '\' literalatomchar Any character except one of the above
1336 * that follow '\' in an atom.
1337 * otheratomchar Any character not first among the other
1338 * atom right-hand sides.
1341 ParseTerm(CompilerState *state)
1343 WCHAR c = *state->cp++;
1345 UINT num, tmp, n, i;
1346 const WCHAR *termStart;
1349 /* assertions and atoms */
1351 state->result = NewRENode(state, REOP_BOL);
1354 state->progLength++;
1357 state->result = NewRENode(state, REOP_EOL);
1360 state->progLength++;
1363 if (state->cp >= state->cpend) {
1364 /* a trailing '\' is an error */
1365 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_TRAILING_SLASH);
1370 /* assertion escapes */
1372 state->result = NewRENode(state, REOP_WBDRY);
1375 state->progLength++;
1378 state->result = NewRENode(state, REOP_WNONBDRY);
1381 state->progLength++;
1383 /* Decimal escape */
1385 /* Give a strict warning. See also the note below. */
1386 WARN("non-octal digit in an escape sequence that doesn't match a back-reference\n");
1389 while (state->cp < state->cpend) {
1391 if (c < '0' || '7' < c)
1394 tmp = 8 * num + (UINT)JS7_UNDEC(c);
1401 state->result = NewRENode(state, REOP_FLAT);
1404 state->result->u.flat.chr = c;
1405 state->result->u.flat.length = 1;
1406 state->progLength += 3;
1417 termStart = state->cp - 1;
1418 num = GetDecimalValue(c, state->parenCount, FindParenCount, state);
1419 if (state->flags & JSREG_FIND_PAREN_ERROR)
1421 if (num == OVERFLOW_VALUE) {
1422 /* Give a strict mode warning. */
1423 WARN("back-reference exceeds number of capturing parentheses\n");
1426 * Note: ECMA 262, 15.10.2.9 says that we should throw a syntax
1427 * error here. However, for compatibility with IE, we treat the
1428 * whole backref as flat if the first character in it is not a
1429 * valid octal character, and as an octal escape otherwise.
1431 state->cp = termStart;
1433 /* Treat this as flat. termStart - 1 is the \. */
1438 /* Treat this as an octal escape. */
1441 assert(1 <= num && num <= 0x10000);
1442 state->result = NewRENode(state, REOP_BACKREF);
1445 state->result->u.parenIndex = num - 1;
1447 += 1 + GetCompactIndexWidth(state->result->u.parenIndex);
1449 /* Control escape */
1465 /* Control letter */
1467 if (state->cp < state->cpend && RE_IS_LETTER(*state->cp)) {
1468 c = (WCHAR) (*state->cp++ & 0x1F);
1470 /* back off to accepting the original '\' as a literal */
1475 /* HexEscapeSequence */
1479 /* UnicodeEscapeSequence */
1484 for (i = 0; i < nDigits && state->cp < state->cpend; i++) {
1487 if (!isASCIIHexDigit(c, &digit)) {
1489 * Back off to accepting the original 'u' or 'x' as a
1496 n = (n << 4) | digit;
1500 /* Character class escapes */
1502 state->result = NewRENode(state, REOP_DIGIT);
1506 state->progLength++;
1509 state->result = NewRENode(state, REOP_NONDIGIT);
1512 state->result = NewRENode(state, REOP_SPACE);
1515 state->result = NewRENode(state, REOP_NONSPACE);
1518 state->result = NewRENode(state, REOP_ALNUM);
1521 state->result = NewRENode(state, REOP_NONALNUM);
1523 /* IdentityEscape */
1525 state->result = NewRENode(state, REOP_FLAT);
1528 state->result->u.flat.chr = c;
1529 state->result->u.flat.length = 1;
1530 state->result->kid = (void *) (state->cp - 1);
1531 state->progLength += 3;
1536 state->result = NewRENode(state, REOP_CLASS);
1539 termStart = state->cp;
1540 state->result->u.ucclass.startIndex = termStart - state->cpbegin;
1542 if (state->cp == state->cpend) {
1543 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1544 JSMSG_UNTERM_CLASS, termStart);
1548 if (*state->cp == '\\') {
1550 if (state->cp != state->cpend)
1554 if (*state->cp == ']') {
1555 state->result->u.ucclass.kidlen = state->cp - termStart;
1560 for (i = 0; i < CLASS_CACHE_SIZE; i++) {
1561 if (!state->classCache[i].start) {
1562 state->classCache[i].start = termStart;
1563 state->classCache[i].length = state->result->u.ucclass.kidlen;
1564 state->classCache[i].index = state->classCount;
1567 if (state->classCache[i].length ==
1568 state->result->u.ucclass.kidlen) {
1569 for (n = 0; ; n++) {
1570 if (n == state->classCache[i].length) {
1571 state->result->u.ucclass.index
1572 = state->classCache[i].index;
1575 if (state->classCache[i].start[n] != termStart[n])
1580 state->result->u.ucclass.index = state->classCount++;
1584 * Call CalculateBitmapSize now as we want any errors it finds
1585 * to be reported during the parse phase, not at execution.
1587 if (!CalculateBitmapSize(state, state->result, termStart, state->cp++))
1590 * Update classBitmapsMem with number of bytes to hold bmsize bits,
1591 * which is (bitsCount + 7) / 8 or (highest_bit + 1 + 7) / 8
1592 * or highest_bit / 8 + 1 where highest_bit is u.ucclass.bmsize.
1594 n = (state->result->u.ucclass.bmsize >> 3) + 1;
1595 if (n > CLASS_BITMAPS_MEM_LIMIT - state->classBitmapsMem) {
1596 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1599 state->classBitmapsMem += n;
1600 /* CLASS, <index> */
1602 += 1 + GetCompactIndexWidth(state->result->u.ucclass.index);
1606 state->result = NewRENode(state, REOP_DOT);
1611 const WCHAR *errp = state->cp--;
1614 err = ParseMinMaxQuantifier(state, TRUE);
1625 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1626 JSMSG_BAD_QUANTIFIER, state->cp - 1);
1630 state->result = NewRENode(state, REOP_FLAT);
1633 state->result->u.flat.chr = c;
1634 state->result->u.flat.length = 1;
1635 state->result->kid = (void *) (state->cp - 1);
1636 state->progLength += 3;
1639 return ParseQuantifier(state);
1643 * Top-down regular expression grammar, based closely on Perl4.
1645 * regexp: altern A regular expression is one or more
1646 * altern '|' regexp alternatives separated by vertical bar.
1648 #define INITIAL_STACK_SIZE 128
1651 ParseRegExp(CompilerState *state)
1655 REOpData *operatorStack;
1656 RENode **operandStack;
1659 BOOL result = FALSE;
1661 INT operatorSP = 0, operatorStackSize = INITIAL_STACK_SIZE;
1662 INT operandSP = 0, operandStackSize = INITIAL_STACK_SIZE;
1664 /* Watch out for empty regexp */
1665 if (state->cp == state->cpend) {
1666 state->result = NewRENode(state, REOP_EMPTY);
1667 return (state->result != NULL);
1670 operatorStack = heap_alloc(sizeof(REOpData) * operatorStackSize);
1674 operandStack = heap_alloc(sizeof(RENode *) * operandStackSize);
1679 parenIndex = state->parenCount;
1680 if (state->cp == state->cpend) {
1682 * If we are at the end of the regexp and we're short one or more
1683 * operands, the regexp must have the form /x|/ or some such, with
1684 * left parentheses making us short more than one operand.
1686 if (operatorSP >= operandSP) {
1687 operand = NewRENode(state, REOP_EMPTY);
1693 switch (*state->cp) {
1696 if (state->cp + 1 < state->cpend &&
1697 *state->cp == '?' &&
1698 (state->cp[1] == '=' ||
1699 state->cp[1] == '!' ||
1700 state->cp[1] == ':')) {
1701 switch (state->cp[1]) {
1704 /* ASSERT, <next>, ... ASSERTTEST */
1705 state->progLength += 4;
1708 op = REOP_ASSERT_NOT;
1709 /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
1710 state->progLength += 4;
1713 op = REOP_LPARENNON;
1719 /* LPAREN, <index>, ... RPAREN, <index> */
1721 += 2 * (1 + GetCompactIndexWidth(parenIndex));
1722 state->parenCount++;
1723 if (state->parenCount == 65535) {
1724 ReportRegExpError(state, JSREPORT_ERROR,
1725 JSMSG_TOO_MANY_PARENS);
1733 * If there's no stacked open parenthesis, throw syntax error.
1735 for (i = operatorSP - 1; ; i--) {
1737 ReportRegExpError(state, JSREPORT_ERROR,
1738 JSMSG_UNMATCHED_RIGHT_PAREN);
1741 if (operatorStack[i].op == REOP_ASSERT ||
1742 operatorStack[i].op == REOP_ASSERT_NOT ||
1743 operatorStack[i].op == REOP_LPARENNON ||
1744 operatorStack[i].op == REOP_LPAREN) {
1751 /* Expected an operand before these, so make an empty one */
1752 operand = NewRENode(state, REOP_EMPTY);
1758 if (!ParseTerm(state))
1760 operand = state->result;
1762 if (operandSP == operandStackSize) {
1764 operandStackSize += operandStackSize;
1765 tmp = heap_realloc(operandStack, sizeof(RENode *) * operandStackSize);
1770 operandStack[operandSP++] = operand;
1775 /* At the end; process remaining operators. */
1777 if (state->cp == state->cpend) {
1778 while (operatorSP) {
1780 if (!ProcessOp(state, &operatorStack[operatorSP],
1781 operandStack, operandSP))
1785 assert(operandSP == 1);
1786 state->result = operandStack[0];
1791 switch (*state->cp) {
1793 /* Process any stacked 'concat' operators */
1795 while (operatorSP &&
1796 operatorStack[operatorSP - 1].op == REOP_CONCAT) {
1798 if (!ProcessOp(state, &operatorStack[operatorSP],
1799 operandStack, operandSP)) {
1809 * If there's no stacked open parenthesis, throw syntax error.
1811 for (i = operatorSP - 1; ; i--) {
1813 ReportRegExpError(state, JSREPORT_ERROR,
1814 JSMSG_UNMATCHED_RIGHT_PAREN);
1817 if (operatorStack[i].op == REOP_ASSERT ||
1818 operatorStack[i].op == REOP_ASSERT_NOT ||
1819 operatorStack[i].op == REOP_LPARENNON ||
1820 operatorStack[i].op == REOP_LPAREN) {
1826 /* Process everything on the stack until the open parenthesis. */
1830 switch (operatorStack[operatorSP].op) {
1832 case REOP_ASSERT_NOT:
1834 operand = NewRENode(state, operatorStack[operatorSP].op);
1837 operand->u.parenIndex =
1838 operatorStack[operatorSP].parenIndex;
1840 operand->kid = operandStack[operandSP - 1];
1841 operandStack[operandSP - 1] = operand;
1842 if (state->treeDepth == TREE_DEPTH_MAX) {
1843 ReportRegExpError(state, JSREPORT_ERROR,
1844 JSMSG_REGEXP_TOO_COMPLEX);
1850 case REOP_LPARENNON:
1851 state->result = operandStack[operandSP - 1];
1852 if (!ParseQuantifier(state))
1854 operandStack[operandSP - 1] = state->result;
1855 goto restartOperator;
1857 if (!ProcessOp(state, &operatorStack[operatorSP],
1858 operandStack, operandSP))
1868 const WCHAR *errp = state->cp;
1870 if (ParseMinMaxQuantifier(state, TRUE) < 0) {
1872 * This didn't even scan correctly as a quantifier, so we should
1886 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_BAD_QUANTIFIER,
1892 /* Anything else is the start of the next term. */
1895 if (operatorSP == operatorStackSize) {
1897 operatorStackSize += operatorStackSize;
1898 tmp = heap_realloc(operatorStack, sizeof(REOpData) * operatorStackSize);
1901 operatorStack = tmp;
1903 operatorStack[operatorSP].op = op;
1904 operatorStack[operatorSP].errPos = state->cp;
1905 operatorStack[operatorSP++].parenIndex = parenIndex;
1910 heap_free(operatorStack);
1911 heap_free(operandStack);
1916 * Save the current state of the match - the position in the input
1917 * text as well as the position in the bytecode. The state of any
1918 * parent expressions is also saved (preceding state).
1919 * Contents of parenCount parentheses from parenIndex are also saved.
1921 static REBackTrackData *
1922 PushBackTrackState(REGlobalData *gData, REOp op,
1923 jsbytecode *target, match_state_t *x, const WCHAR *cp,
1924 size_t parenIndex, size_t parenCount)
1927 REBackTrackData *result =
1928 (REBackTrackData *) ((char *)gData->backTrackSP + gData->cursz);
1930 size_t sz = sizeof(REBackTrackData) +
1931 gData->stateStackTop * sizeof(REProgState) +
1932 parenCount * sizeof(RECapture);
1934 ptrdiff_t btsize = gData->backTrackStackSize;
1935 ptrdiff_t btincr = ((char *)result + sz) -
1936 ((char *)gData->backTrackStack + btsize);
1938 TRACE("\tBT_Push: %lu,%lu\n", (ULONG_PTR)parenIndex, (ULONG_PTR)parenCount);
1940 JS_COUNT_OPERATION(gData->cx, JSOW_JUMP * (1 + parenCount));
1942 ptrdiff_t offset = (char *)result - (char *)gData->backTrackStack;
1944 JS_COUNT_OPERATION(gData->cx, JSOW_ALLOCATION);
1945 btincr = ((btincr+btsize-1)/btsize)*btsize;
1946 gData->backTrackStack = heap_pool_grow(gData->pool, gData->backTrackStack, btsize, btincr);
1947 if (!gData->backTrackStack) {
1948 js_ReportOutOfScriptQuota(gData->cx);
1952 gData->backTrackStackSize = btsize + btincr;
1953 result = (REBackTrackData *) ((char *)gData->backTrackStack + offset);
1955 gData->backTrackSP = result;
1956 result->sz = gData->cursz;
1959 result->backtrack_op = op;
1960 result->backtrack_pc = target;
1962 result->parenCount = parenCount;
1963 result->parenIndex = parenIndex;
1965 result->saveStateStackTop = gData->stateStackTop;
1966 assert(gData->stateStackTop);
1967 memcpy(result + 1, gData->stateStack,
1968 sizeof(REProgState) * result->saveStateStackTop);
1970 if (parenCount != 0) {
1971 memcpy((char *)(result + 1) +
1972 sizeof(REProgState) * result->saveStateStackTop,
1973 &x->parens[parenIndex],
1974 sizeof(RECapture) * parenCount);
1975 for (i = 0; i != parenCount; i++)
1976 x->parens[parenIndex + i].index = -1;
1982 static inline match_state_t *
1983 FlatNIMatcher(REGlobalData *gData, match_state_t *x, const WCHAR *matchChars,
1987 assert(gData->cpend >= x->cp);
1988 if (length > (size_t)(gData->cpend - x->cp))
1990 for (i = 0; i != length; i++) {
1991 if (toupperW(matchChars[i]) != toupperW(x->cp[i]))
1999 * 1. Evaluate DecimalEscape to obtain an EscapeValue E.
2000 * 2. If E is not a character then go to step 6.
2001 * 3. Let ch be E's character.
2002 * 4. Let A be a one-element RECharSet containing the character ch.
2003 * 5. Call CharacterSetMatcher(A, false) and return its Matcher result.
2004 * 6. E must be an integer. Let n be that integer.
2005 * 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception.
2006 * 8. Return an internal Matcher closure that takes two arguments, a State x
2007 * and a Continuation c, and performs the following:
2008 * 1. Let cap be x's captures internal array.
2009 * 2. Let s be cap[n].
2010 * 3. If s is undefined, then call c(x) and return its result.
2011 * 4. Let e be x's endIndex.
2012 * 5. Let len be s's length.
2013 * 6. Let f be e+len.
2014 * 7. If f>InputLength, return failure.
2015 * 8. If there exists an integer i between 0 (inclusive) and len (exclusive)
2016 * such that Canonicalize(s[i]) is not the same character as
2017 * Canonicalize(Input [e+i]), then return failure.
2018 * 9. Let y be the State (f, cap).
2019 * 10. Call c(y) and return its result.
2021 static match_state_t *
2022 BackrefMatcher(REGlobalData *gData, match_state_t *x, size_t parenIndex)
2025 const WCHAR *parenContent;
2026 RECapture *cap = &x->parens[parenIndex];
2028 if (cap->index == -1)
2032 if (x->cp + len > gData->cpend)
2035 parenContent = &gData->cpbegin[cap->index];
2036 if (gData->regexp->flags & REG_FOLD) {
2037 for (i = 0; i < len; i++) {
2038 if (toupperW(parenContent[i]) != toupperW(x->cp[i]))
2042 for (i = 0; i < len; i++) {
2043 if (parenContent[i] != x->cp[i])
2051 /* Add a single character to the RECharSet */
2053 AddCharacterToCharSet(RECharSet *cs, WCHAR c)
2055 UINT byteIndex = (UINT)(c >> 3);
2056 assert(c <= cs->length);
2057 cs->u.bits[byteIndex] |= 1 << (c & 0x7);
2061 /* Add a character range, c1 to c2 (inclusive) to the RECharSet */
2063 AddCharacterRangeToCharSet(RECharSet *cs, UINT c1, UINT c2)
2067 UINT byteIndex1 = c1 >> 3;
2068 UINT byteIndex2 = c2 >> 3;
2070 assert(c2 <= cs->length && c1 <= c2);
2075 if (byteIndex1 == byteIndex2) {
2076 cs->u.bits[byteIndex1] |= ((BYTE)0xFF >> (7 - (c2 - c1))) << c1;
2078 cs->u.bits[byteIndex1] |= 0xFF << c1;
2079 for (i = byteIndex1 + 1; i < byteIndex2; i++)
2080 cs->u.bits[i] = 0xFF;
2081 cs->u.bits[byteIndex2] |= (BYTE)0xFF >> (7 - c2);
2085 /* Compile the source of the class into a RECharSet */
2087 ProcessCharSet(REGlobalData *gData, RECharSet *charSet)
2089 const WCHAR *src, *end;
2090 BOOL inRange = FALSE;
2091 WCHAR rangeStart = 0;
2096 assert(!charSet->converted);
2098 * Assert that startIndex and length points to chars inside [] inside
2101 assert(1 <= charSet->u.src.startIndex);
2102 assert(charSet->u.src.startIndex < gData->regexp->source_len);
2103 assert(charSet->u.src.length <= gData->regexp->source_len
2104 - 1 - charSet->u.src.startIndex);
2106 charSet->converted = TRUE;
2107 src = gData->regexp->source + charSet->u.src.startIndex;
2109 end = src + charSet->u.src.length;
2111 assert(src[-1] == '[' && end[0] == ']');
2113 byteLength = (charSet->length >> 3) + 1;
2114 charSet->u.bits = heap_alloc(byteLength);
2115 if (!charSet->u.bits) {
2116 JS_ReportOutOfMemory(gData->cx);
2120 memset(charSet->u.bits, 0, byteLength);
2126 assert(charSet->sense == FALSE);
2129 assert(charSet->sense == TRUE);
2132 while (src != end) {
2157 if (src < end && JS_ISWORD(*src)) {
2158 thisCh = (WCHAR)(*src++ & 0x1F);
2171 for (i = 0; (i < nDigits) && (src < end); i++) {
2174 if (!isASCIIHexDigit(c, &digit)) {
2176 * Back off to accepting the original '\'
2183 n = (n << 4) | digit;
2196 * This is a non-ECMA extension - decimal escapes (in this
2197 * case, octal!) are supposed to be an error inside class
2198 * ranges, but supported here for backwards compatibility.
2202 if ('0' <= c && c <= '7') {
2204 n = 8 * n + JS7_UNDEC(c);
2206 if ('0' <= c && c <= '7') {
2208 i = 8 * n + JS7_UNDEC(c);
2219 AddCharacterRangeToCharSet(charSet, '0', '9');
2220 continue; /* don't need range processing */
2222 AddCharacterRangeToCharSet(charSet, 0, '0' - 1);
2223 AddCharacterRangeToCharSet(charSet,
2225 (WCHAR)charSet->length);
2228 for (i = (INT)charSet->length; i >= 0; i--)
2230 AddCharacterToCharSet(charSet, (WCHAR)i);
2233 for (i = (INT)charSet->length; i >= 0; i--)
2235 AddCharacterToCharSet(charSet, (WCHAR)i);
2238 for (i = (INT)charSet->length; i >= 0; i--)
2240 AddCharacterToCharSet(charSet, (WCHAR)i);
2243 for (i = (INT)charSet->length; i >= 0; i--)
2245 AddCharacterToCharSet(charSet, (WCHAR)i);
2260 if (gData->regexp->flags & REG_FOLD) {
2261 assert(rangeStart <= thisCh);
2262 for (i = rangeStart; i <= thisCh; i++) {
2265 AddCharacterToCharSet(charSet, i);
2269 AddCharacterToCharSet(charSet, uch);
2271 AddCharacterToCharSet(charSet, dch);
2274 AddCharacterRangeToCharSet(charSet, rangeStart, thisCh);
2278 if (gData->regexp->flags & REG_FOLD) {
2279 AddCharacterToCharSet(charSet, toupperW(thisCh));
2280 AddCharacterToCharSet(charSet, tolowerW(thisCh));
2282 AddCharacterToCharSet(charSet, thisCh);
2284 if (src < end - 1) {
2288 rangeStart = thisCh;
2297 ReallocStateStack(REGlobalData *gData)
2299 size_t limit = gData->stateStackLimit;
2300 size_t sz = sizeof(REProgState) * limit;
2302 gData->stateStack = heap_pool_grow(gData->pool, gData->stateStack, sz, sz);
2303 if (!gData->stateStack) {
2304 js_ReportOutOfScriptQuota(gData->cx);
2308 gData->stateStackLimit = limit + limit;
2312 #define PUSH_STATE_STACK(data) \
2314 ++(data)->stateStackTop; \
2315 if ((data)->stateStackTop == (data)->stateStackLimit && \
2316 !ReallocStateStack((data))) { \
2322 * Apply the current op against the given input to see if it's going to match
2323 * or fail. Return false if we don't get a match, true if we do. If updatecp is
2324 * true, then update the current state's cp. Always update startpc to the next
2327 static inline match_state_t *
2328 SimpleMatch(REGlobalData *gData, match_state_t *x, REOp op,
2329 jsbytecode **startpc, BOOL updatecp)
2331 match_state_t *result = NULL;
2334 size_t offset, length, index;
2335 jsbytecode *pc = *startpc; /* pc has already been incremented past op */
2336 const WCHAR *source;
2337 const WCHAR *startcp = x->cp;
2341 const char *opname = reop_names[op];
2342 TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
2343 (int)gData->stateStackTop * 2, "", opname);
2350 if (x->cp != gData->cpbegin) {
2351 if (/*!gData->cx->regExpStatics.multiline && FIXME !!! */
2352 !(gData->regexp->flags & REG_MULTILINE)) {
2355 if (!RE_IS_LINE_TERM(x->cp[-1]))
2361 if (x->cp != gData->cpend) {
2362 if (/*!gData->cx->regExpStatics.multiline &&*/
2363 !(gData->regexp->flags & REG_MULTILINE)) {
2366 if (!RE_IS_LINE_TERM(*x->cp))
2372 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2373 !(x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2378 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2379 (x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2384 if (x->cp != gData->cpend && !RE_IS_LINE_TERM(*x->cp)) {
2390 if (x->cp != gData->cpend && JS7_ISDEC(*x->cp)) {
2396 if (x->cp != gData->cpend && !JS7_ISDEC(*x->cp)) {
2402 if (x->cp != gData->cpend && JS_ISWORD(*x->cp)) {
2408 if (x->cp != gData->cpend && !JS_ISWORD(*x->cp)) {
2414 if (x->cp != gData->cpend && isspaceW(*x->cp)) {
2420 if (x->cp != gData->cpend && !isspaceW(*x->cp)) {
2426 pc = ReadCompactIndex(pc, &parenIndex);
2427 assert(parenIndex < gData->regexp->parenCount);
2428 result = BackrefMatcher(gData, x, parenIndex);
2431 pc = ReadCompactIndex(pc, &offset);
2432 assert(offset < gData->regexp->source_len);
2433 pc = ReadCompactIndex(pc, &length);
2434 assert(1 <= length);
2435 assert(length <= gData->regexp->source_len - offset);
2436 if (length <= (size_t)(gData->cpend - x->cp)) {
2437 source = gData->regexp->source + offset;
2438 TRACE("%s\n", debugstr_wn(source, length));
2439 for (index = 0; index != length; index++) {
2440 if (source[index] != x->cp[index])
2449 TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2450 if (x->cp != gData->cpend && *x->cp == matchCh) {
2456 pc = ReadCompactIndex(pc, &offset);
2457 assert(offset < gData->regexp->source_len);
2458 pc = ReadCompactIndex(pc, &length);
2459 assert(1 <= length);
2460 assert(length <= gData->regexp->source_len - offset);
2461 source = gData->regexp->source;
2462 result = FlatNIMatcher(gData, x, source + offset, length);
2466 if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
2472 matchCh = GET_ARG(pc);
2473 TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2475 if (x->cp != gData->cpend && *x->cp == matchCh) {
2481 matchCh = GET_ARG(pc);
2483 if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
2489 pc = ReadCompactIndex(pc, &index);
2490 assert(index < gData->regexp->classCount);
2491 if (x->cp != gData->cpend) {
2492 charSet = &gData->regexp->classList[index];
2493 assert(charSet->converted);
2496 if (charSet->length != 0 &&
2497 ch <= charSet->length &&
2498 (charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2505 pc = ReadCompactIndex(pc, &index);
2506 assert(index < gData->regexp->classCount);
2507 if (x->cp != gData->cpend) {
2508 charSet = &gData->regexp->classList[index];
2509 assert(charSet->converted);
2512 if (charSet->length == 0 ||
2513 ch > charSet->length ||
2514 !(charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2535 static inline match_state_t *
2536 ExecuteREBytecode(REGlobalData *gData, match_state_t *x)
2538 match_state_t *result = NULL;
2539 REBackTrackData *backTrackData;
2540 jsbytecode *nextpc, *testpc;
2543 REProgState *curState;
2544 const WCHAR *startcp;
2545 size_t parenIndex, k;
2546 size_t parenSoFar = 0;
2548 WCHAR matchCh1, matchCh2;
2552 jsbytecode *pc = gData->regexp->program;
2553 REOp op = (REOp) *pc++;
2556 * If the first node is a simple match, step the index into the string
2557 * until that match is made, or fail if it can't be found at all.
2559 if (REOP_IS_SIMPLE(op) && !(gData->regexp->flags & REG_STICKY)) {
2561 while (x->cp <= gData->cpend) {
2562 nextpc = pc; /* reset back to start each time */
2563 result = SimpleMatch(gData, x, op, &nextpc, TRUE);
2567 pc = nextpc; /* accept skip to next opcode */
2569 assert(op < REOP_LIMIT);
2580 const char *opname = reop_names[op];
2581 TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
2582 (int)gData->stateStackTop * 2, "", opname);
2584 if (REOP_IS_SIMPLE(op)) {
2585 result = SimpleMatch(gData, x, op, &pc, TRUE);
2587 curState = &gData->stateStack[gData->stateStackTop];
2591 case REOP_ALTPREREQ2:
2592 nextpc = pc + GET_OFFSET(pc); /* start of next op */
2594 matchCh2 = GET_ARG(pc);
2599 if (x->cp != gData->cpend) {
2600 if (*x->cp == matchCh2)
2603 charSet = &gData->regexp->classList[k];
2604 if (!charSet->converted && !ProcessCharSet(gData, charSet))
2608 if ((charSet->length == 0 ||
2609 matchCh1 > charSet->length ||
2610 !(charSet->u.bits[k] & (1 << (matchCh1 & 0x7)))) ^
2618 case REOP_ALTPREREQ:
2619 nextpc = pc + GET_OFFSET(pc); /* start of next op */
2621 matchCh1 = GET_ARG(pc);
2623 matchCh2 = GET_ARG(pc);
2625 if (x->cp == gData->cpend ||
2626 (*x->cp != matchCh1 && *x->cp != matchCh2)) {
2630 /* else fall through... */
2634 nextpc = pc + GET_OFFSET(pc); /* start of next alternate */
2635 pc += ARG_LEN; /* start of this alternate */
2636 curState->parenSoFar = parenSoFar;
2637 PUSH_STATE_STACK(gData);
2640 if (REOP_IS_SIMPLE(op)) {
2641 if (!SimpleMatch(gData, x, op, &pc, TRUE)) {
2642 op = (REOp) *nextpc++;
2649 nextop = (REOp) *nextpc++;
2650 if (!PushBackTrackState(gData, nextop, nextpc, x, startcp, 0, 0))
2655 * Occurs at (successful) end of REOP_ALT,
2659 * If we have not gotten a result here, it is because of an
2660 * empty match. Do the same thing REOP_EMPTY would do.
2665 --gData->stateStackTop;
2666 pc += GET_OFFSET(pc);
2671 * Occurs at last (successful) end of REOP_ALT,
2675 * If we have not gotten a result here, it is because of an
2676 * empty match. Do the same thing REOP_EMPTY would do.
2681 --gData->stateStackTop;
2686 pc = ReadCompactIndex(pc, &parenIndex);
2687 TRACE("[ %lu ]\n", (ULONG_PTR)parenIndex);
2688 assert(parenIndex < gData->regexp->parenCount);
2689 if (parenIndex + 1 > parenSoFar)
2690 parenSoFar = parenIndex + 1;
2691 x->parens[parenIndex].index = x->cp - gData->cpbegin;
2692 x->parens[parenIndex].length = 0;
2700 pc = ReadCompactIndex(pc, &parenIndex);
2701 assert(parenIndex < gData->regexp->parenCount);
2702 cap = &x->parens[parenIndex];
2703 delta = x->cp - (gData->cpbegin + cap->index);
2704 cap->length = (delta < 0) ? 0 : (size_t) delta;
2712 nextpc = pc + GET_OFFSET(pc); /* start of term after ASSERT */
2713 pc += ARG_LEN; /* start of ASSERT child */
2716 if (REOP_IS_SIMPLE(op) &&
2717 !SimpleMatch(gData, x, op, &testpc, FALSE)) {
2721 curState->u.assertion.top =
2722 (char *)gData->backTrackSP - (char *)gData->backTrackStack;
2723 curState->u.assertion.sz = gData->cursz;
2724 curState->index = x->cp - gData->cpbegin;
2725 curState->parenSoFar = parenSoFar;
2726 PUSH_STATE_STACK(gData);
2727 if (!PushBackTrackState(gData, REOP_ASSERTTEST,
2728 nextpc, x, x->cp, 0, 0)) {
2733 case REOP_ASSERT_NOT:
2734 nextpc = pc + GET_OFFSET(pc);
2738 if (REOP_IS_SIMPLE(op) /* Note - fail to fail! */ &&
2739 SimpleMatch(gData, x, op, &testpc, FALSE) &&
2740 *testpc == REOP_ASSERTNOTTEST) {
2744 curState->u.assertion.top
2745 = (char *)gData->backTrackSP -
2746 (char *)gData->backTrackStack;
2747 curState->u.assertion.sz = gData->cursz;
2748 curState->index = x->cp - gData->cpbegin;
2749 curState->parenSoFar = parenSoFar;
2750 PUSH_STATE_STACK(gData);
2751 if (!PushBackTrackState(gData, REOP_ASSERTNOTTEST,
2752 nextpc, x, x->cp, 0, 0)) {
2757 case REOP_ASSERTTEST:
2758 --gData->stateStackTop;
2760 x->cp = gData->cpbegin + curState->index;
2761 gData->backTrackSP =
2762 (REBackTrackData *) ((char *)gData->backTrackStack +
2763 curState->u.assertion.top);
2764 gData->cursz = curState->u.assertion.sz;
2769 case REOP_ASSERTNOTTEST:
2770 --gData->stateStackTop;
2772 x->cp = gData->cpbegin + curState->index;
2773 gData->backTrackSP =
2774 (REBackTrackData *) ((char *)gData->backTrackStack +
2775 curState->u.assertion.top);
2776 gData->cursz = curState->u.assertion.sz;
2777 result = (!result) ? x : NULL;
2780 curState->u.quantifier.min = 0;
2781 curState->u.quantifier.max = (UINT)-1;
2784 curState->u.quantifier.min = 1;
2785 curState->u.quantifier.max = (UINT)-1;
2788 curState->u.quantifier.min = 0;
2789 curState->u.quantifier.max = 1;
2792 pc = ReadCompactIndex(pc, &k);
2793 curState->u.quantifier.min = k;
2794 pc = ReadCompactIndex(pc, &k);
2795 /* max is k - 1 to use one byte for (UINT)-1 sentinel. */
2796 curState->u.quantifier.max = k - 1;
2797 assert(curState->u.quantifier.min <= curState->u.quantifier.max);
2799 if (curState->u.quantifier.max == 0) {
2800 pc = pc + GET_OFFSET(pc);
2805 /* Step over <next> */
2806 nextpc = pc + ARG_LEN;
2807 op = (REOp) *nextpc++;
2809 if (REOP_IS_SIMPLE(op)) {
2810 if (!SimpleMatch(gData, x, op, &nextpc, TRUE)) {
2811 if (curState->u.quantifier.min == 0)
2815 pc = pc + GET_OFFSET(pc);
2818 op = (REOp) *nextpc++;
2821 curState->index = startcp - gData->cpbegin;
2822 curState->continue_op = REOP_REPEAT;
2823 curState->continue_pc = pc;
2824 curState->parenSoFar = parenSoFar;
2825 PUSH_STATE_STACK(gData);
2826 if (curState->u.quantifier.min == 0 &&
2827 !PushBackTrackState(gData, REOP_REPEAT, pc, x, startcp,
2834 case REOP_ENDCHILD: /* marks the end of a quantifier child */
2835 pc = curState[-1].continue_pc;
2836 op = (REOp) curState[-1].continue_op;
2845 --gData->stateStackTop;
2847 /* Failed, see if we have enough children. */
2848 if (curState->u.quantifier.min == 0)
2852 if (curState->u.quantifier.min == 0 &&
2853 x->cp == gData->cpbegin + curState->index) {
2854 /* matched an empty string, that'll get us nowhere */
2858 if (curState->u.quantifier.min != 0)
2859 curState->u.quantifier.min--;
2860 if (curState->u.quantifier.max != (UINT) -1)
2861 curState->u.quantifier.max--;
2862 if (curState->u.quantifier.max == 0)
2864 nextpc = pc + ARG_LEN;
2865 nextop = (REOp) *nextpc;
2867 if (REOP_IS_SIMPLE(nextop)) {
2869 if (!SimpleMatch(gData, x, nextop, &nextpc, TRUE)) {
2870 if (curState->u.quantifier.min == 0)
2877 curState->index = startcp - gData->cpbegin;
2878 PUSH_STATE_STACK(gData);
2879 if (curState->u.quantifier.min == 0 &&
2880 !PushBackTrackState(gData, REOP_REPEAT,
2882 curState->parenSoFar,
2884 curState->parenSoFar)) {
2887 } while (*nextpc == REOP_ENDCHILD);
2890 parenSoFar = curState->parenSoFar;
2895 pc += GET_OFFSET(pc);
2898 case REOP_MINIMALSTAR:
2899 curState->u.quantifier.min = 0;
2900 curState->u.quantifier.max = (UINT)-1;
2901 goto minimalquantcommon;
2902 case REOP_MINIMALPLUS:
2903 curState->u.quantifier.min = 1;
2904 curState->u.quantifier.max = (UINT)-1;
2905 goto minimalquantcommon;
2906 case REOP_MINIMALOPT:
2907 curState->u.quantifier.min = 0;
2908 curState->u.quantifier.max = 1;
2909 goto minimalquantcommon;
2910 case REOP_MINIMALQUANT:
2911 pc = ReadCompactIndex(pc, &k);
2912 curState->u.quantifier.min = k;
2913 pc = ReadCompactIndex(pc, &k);
2914 /* See REOP_QUANT comments about k - 1. */
2915 curState->u.quantifier.max = k - 1;
2916 assert(curState->u.quantifier.min
2917 <= curState->u.quantifier.max);
2919 curState->index = x->cp - gData->cpbegin;
2920 curState->parenSoFar = parenSoFar;
2921 PUSH_STATE_STACK(gData);
2922 if (curState->u.quantifier.min != 0) {
2923 curState->continue_op = REOP_MINIMALREPEAT;
2924 curState->continue_pc = pc;
2925 /* step over <next> */
2929 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
2930 pc, x, x->cp, 0, 0)) {
2933 --gData->stateStackTop;
2934 pc = pc + GET_OFFSET(pc);
2939 case REOP_MINIMALREPEAT:
2940 --gData->stateStackTop;
2943 TRACE("{%d,%d}\n", curState->u.quantifier.min, curState->u.quantifier.max);
2944 #define PREPARE_REPEAT() \
2946 curState->index = x->cp - gData->cpbegin; \
2947 curState->continue_op = REOP_MINIMALREPEAT; \
2948 curState->continue_pc = pc; \
2950 for (k = curState->parenSoFar; k < parenSoFar; k++) \
2951 x->parens[k].index = -1; \
2952 PUSH_STATE_STACK(gData); \
2953 op = (REOp) *pc++; \
2954 assert(op < REOP_LIMIT); \
2960 * Non-greedy failure - try to consume another child.
2962 if (curState->u.quantifier.max == (UINT) -1 ||
2963 curState->u.quantifier.max > 0) {
2967 /* Don't need to adjust pc since we're going to pop. */
2970 if (curState->u.quantifier.min == 0 &&
2971 x->cp == gData->cpbegin + curState->index) {
2972 /* Matched an empty string, that'll get us nowhere. */
2976 if (curState->u.quantifier.min != 0)
2977 curState->u.quantifier.min--;
2978 if (curState->u.quantifier.max != (UINT) -1)
2979 curState->u.quantifier.max--;
2980 if (curState->u.quantifier.min != 0) {
2984 curState->index = x->cp - gData->cpbegin;
2985 curState->parenSoFar = parenSoFar;
2986 PUSH_STATE_STACK(gData);
2987 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
2989 curState->parenSoFar,
2990 parenSoFar - curState->parenSoFar)) {
2993 --gData->stateStackTop;
2994 pc = pc + GET_OFFSET(pc);
2996 assert(op < REOP_LIMIT);
3006 * If the match failed and there's a backtrack option, take it.
3007 * Otherwise this is a complete and utter failure.
3010 if (gData->cursz == 0)
3013 /* Potentially detect explosive regex here. */
3014 gData->backTrackCount++;
3015 if (gData->backTrackLimit &&
3016 gData->backTrackCount >= gData->backTrackLimit) {
3017 JS_ReportErrorNumber(gData->cx, js_GetErrorMessage, NULL,
3018 JSMSG_REGEXP_TOO_COMPLEX);
3023 backTrackData = gData->backTrackSP;
3024 gData->cursz = backTrackData->sz;
3025 gData->backTrackSP =
3026 (REBackTrackData *) ((char *)backTrackData - backTrackData->sz);
3027 x->cp = backTrackData->cp;
3028 pc = backTrackData->backtrack_pc;
3029 op = (REOp) backTrackData->backtrack_op;
3030 assert(op < REOP_LIMIT);
3031 gData->stateStackTop = backTrackData->saveStateStackTop;
3032 assert(gData->stateStackTop);
3034 memcpy(gData->stateStack, backTrackData + 1,
3035 sizeof(REProgState) * backTrackData->saveStateStackTop);
3036 curState = &gData->stateStack[gData->stateStackTop - 1];
3038 if (backTrackData->parenCount) {
3039 memcpy(&x->parens[backTrackData->parenIndex],
3040 (char *)(backTrackData + 1) +
3041 sizeof(REProgState) * backTrackData->saveStateStackTop,
3042 sizeof(RECapture) * backTrackData->parenCount);
3043 parenSoFar = backTrackData->parenIndex + backTrackData->parenCount;
3045 for (k = curState->parenSoFar; k < parenSoFar; k++)
3046 x->parens[k].index = -1;
3047 parenSoFar = curState->parenSoFar;
3050 TRACE("\tBT_Pop: %ld,%ld\n",
3051 (ULONG_PTR)backTrackData->parenIndex,
3052 (ULONG_PTR)backTrackData->parenCount);
3058 * Continue with the expression.
3061 assert(op < REOP_LIMIT);
3073 static match_state_t *MatchRegExp(REGlobalData *gData, match_state_t *x)
3075 match_state_t *result;
3076 const WCHAR *cp = x->cp;
3081 * Have to include the position beyond the last character
3082 * in order to detect end-of-input/line condition.
3084 for (cp2 = cp; cp2 <= gData->cpend; cp2++) {
3085 gData->skipped = cp2 - cp;
3087 for (j = 0; j < gData->regexp->parenCount; j++)
3088 x->parens[j].index = -1;
3089 result = ExecuteREBytecode(gData, x);
3090 if (!gData->ok || result || (gData->regexp->flags & REG_STICKY))
3092 gData->backTrackSP = gData->backTrackStack;
3094 gData->stateStackTop = 0;
3095 cp2 = cp + gData->skipped;
3100 static HRESULT InitMatch(regexp_t *re, void *cx, heap_pool_t *pool, REGlobalData *gData)
3104 gData->backTrackStackSize = INITIAL_BACKTRACK;
3105 gData->backTrackStack = heap_pool_alloc(gData->pool, INITIAL_BACKTRACK);
3106 if (!gData->backTrackStack)
3109 gData->backTrackSP = gData->backTrackStack;
3111 gData->backTrackCount = 0;
3112 gData->backTrackLimit = 0;
3114 gData->stateStackLimit = INITIAL_STATESTACK;
3115 gData->stateStack = heap_pool_alloc(gData->pool, sizeof(REProgState) * INITIAL_STATESTACK);
3116 if (!gData->stateStack)
3119 gData->stateStackTop = 0;
3125 for (i = 0; i < re->classCount; i++) {
3126 if (!re->classList[i].converted &&
3127 !ProcessCharSet(gData, &re->classList[i])) {
3135 js_ReportOutOfScriptQuota(cx);
3137 return E_OUTOFMEMORY;
3140 HRESULT regexp_execute(regexp_t *regexp, void *cx, heap_pool_t *pool,
3141 const WCHAR *str, DWORD str_len, match_state_t *result)
3145 heap_pool_t *mark = heap_pool_mark(pool);
3146 const WCHAR *str_beg = result->cp;
3149 assert(result->cp != NULL);
3151 gData.cpbegin = str;
3152 gData.cpend = str+str_len;
3153 gData.start = result->cp-str;
3157 hres = InitMatch(regexp, cx, pool, &gData);
3159 WARN("InitMatch failed\n");
3160 heap_pool_clear(mark);
3164 res = MatchRegExp(&gData, result);
3165 heap_pool_clear(mark);
3167 WARN("MatchRegExp failed\n");
3172 result->match_len = 0;
3176 result->match_len = (result->cp-str_beg) - gData.skipped;
3177 result->paren_count = regexp->parenCount;
3181 void regexp_destroy(regexp_t *re)
3183 if (re->classList) {
3185 for (i = 0; i < re->classCount; i++) {
3186 if (re->classList[i].converted)
3187 heap_free(re->classList[i].u.bits);
3188 re->classList[i].u.bits = NULL;
3190 heap_free(re->classList);
3195 regexp_t* regexp_new(void *cx, heap_pool_t *pool, const WCHAR *str,
3196 DWORD str_len, WORD flags, BOOL flat)
3200 CompilerState state;
3207 mark = heap_pool_mark(pool);
3215 state.cpbegin = state.cp;
3216 state.cpend = state.cp + len;
3217 state.flags = flags;
3218 state.parenCount = 0;
3219 state.classCount = 0;
3220 state.progLength = 0;
3221 state.treeDepth = 0;
3222 state.classBitmapsMem = 0;
3223 for (i = 0; i < CLASS_CACHE_SIZE; i++)
3224 state.classCache[i].start = NULL;
3226 if (len != 0 && flat) {
3227 state.result = NewRENode(&state, REOP_FLAT);
3230 state.result->u.flat.chr = *state.cpbegin;
3231 state.result->u.flat.length = len;
3232 state.result->kid = (void *) state.cpbegin;
3233 /* Flat bytecode: REOP_FLAT compact(string_offset) compact(len). */
3234 state.progLength += 1 + GetCompactIndexWidth(0)
3235 + GetCompactIndexWidth(len);
3237 if (!ParseRegExp(&state))
3240 resize = offsetof(regexp_t, program) + state.progLength + 1;
3241 re = heap_alloc(resize);
3245 assert(state.classBitmapsMem <= CLASS_BITMAPS_MEM_LIMIT);
3246 re->classCount = state.classCount;
3247 if (re->classCount) {
3248 re->classList = heap_alloc(re->classCount * sizeof(RECharSet));
3249 if (!re->classList) {
3254 for (i = 0; i < re->classCount; i++)
3255 re->classList[i].converted = FALSE;
3257 re->classList = NULL;
3259 endPC = EmitREBytecode(&state, re, state.treeDepth, re->program, state.result);
3265 *endPC++ = REOP_END;
3267 * Check whether size was overestimated and shrink using realloc.
3268 * This is safe since no pointers to newly parsed regexp or its parts
3269 * besides re exist here.
3271 if ((size_t)(endPC - re->program) != state.progLength + 1) {
3273 assert((size_t)(endPC - re->program) < state.progLength + 1);
3274 resize = offsetof(regexp_t, program) + (endPC - re->program);
3275 tmp = heap_realloc(re, resize);
3281 re->parenCount = state.parenCount;
3283 re->source_len = str_len;
3286 heap_pool_clear(mark);
3290 HRESULT regexp_set_flags(regexp_t **regexp, void *cx, heap_pool_t *pool, WORD flags)
3292 if(((*regexp)->flags & REG_FOLD) != (flags & REG_FOLD)) {
3293 regexp_t *new_regexp = regexp_new(cx, pool, (*regexp)->source,
3294 (*regexp)->source_len, flags, FALSE);
3299 regexp_destroy(*regexp);
3300 *regexp = new_regexp;
3302 (*regexp)->flags = flags;