2 * Copyright 2008 Jacek Caban for CodeWeavers
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
20 * Code in this file is based on files:
23 * from Mozilla project, released under LGPL 2.1 or later.
25 * The Original Code is Mozilla Communicator client code, released
28 * The Initial Developer of the Original Code is
29 * Netscape Communications Corporation.
30 * Portions created by the Initial Developer are Copyright (C) 1998
31 * the Initial Developer. All Rights Reserved.
38 #include "wine/debug.h"
40 WINE_DEFAULT_DEBUG_CHANNEL(jscript);
42 #define JSREG_FOLD 0x01 /* fold uppercase to lowercase */
43 #define JSREG_GLOB 0x02 /* global exec, creates array of matches */
44 #define JSREG_MULTILINE 0x04 /* treat ^ and $ as begin and end of line */
45 #define JSREG_STICKY 0x08 /* only match starting at lastIndex */
47 typedef BYTE JSPackedBool;
48 typedef BYTE jsbytecode;
51 * This struct holds a bitmap representation of a class from a regexp.
52 * There's a list of these referenced by the classList field in the JSRegExp
53 * struct below. The initial state has startIndex set to the offset in the
54 * original regexp source of the beginning of the class contents. The first
55 * use of the class converts the source representation into a bitmap.
58 typedef struct RECharSet {
59 JSPackedBool converted;
72 WORD flags; /* flags, see jsapi.h's JSREG_* defines */
73 size_t parenCount; /* number of parenthesized submatches */
74 size_t classCount; /* count [...] bitmaps */
75 RECharSet *classList; /* list of [...] bitmaps */
76 BSTR source; /* locked source string, sans // */
77 jsbytecode program[1]; /* regular expression bytecode */
87 static const WCHAR sourceW[] = {'s','o','u','r','c','e',0};
88 static const WCHAR globalW[] = {'g','l','o','b','a','l',0};
89 static const WCHAR ignoreCaseW[] = {'i','g','n','o','r','e','C','a','s','e',0};
90 static const WCHAR multilineW[] = {'m','u','l','t','i','l','i','n','e',0};
91 static const WCHAR lastIndexW[] = {'l','a','s','t','I','n','d','e','x',0};
92 static const WCHAR toStringW[] = {'t','o','S','t','r','i','n','g',0};
93 static const WCHAR toLocaleStringW[] = {'t','o','L','o','c','a','l','e','S','t','r','i','n','g',0};
94 static const WCHAR hasOwnPropertyW[] = {'h','a','s','O','w','n','P','r','o','p','e','r','t','y',0};
95 static const WCHAR propertyIsEnumerableW[] =
96 {'p','r','o','p','e','r','t','y','I','s','E','n','u','m','e','r','a','b','l','e',0};
97 static const WCHAR isPrototypeOfW[] = {'i','s','P','r','o','t','o','t','y','p','e','O','f',0};
98 static const WCHAR execW[] = {'e','x','e','c',0};
100 static const WCHAR emptyW[] = {0};
102 /* FIXME: Better error handling */
103 #define ReportRegExpError(a,b,c)
104 #define ReportRegExpErrorHelper(a,b,c,d)
105 #define JS_ReportErrorNumber(a,b,c,d)
106 #define JS_ReportErrorFlagsAndNumber(a,b,c,d,e,f)
107 #define js_ReportOutOfScriptQuota(a)
108 #define JS_ReportOutOfMemory(a)
109 #define JS_COUNT_OPERATION(a,b)
111 #define JSMSG_MIN_TOO_BIG 47
112 #define JSMSG_MAX_TOO_BIG 48
113 #define JSMSG_OUT_OF_ORDER 49
114 #define JSMSG_OUT_OF_MEMORY 137
116 #define LINE_SEPARATOR 0x2028
117 #define PARA_SEPARATOR 0x2029
119 #define RE_IS_LETTER(c) (((c >= 'A') && (c <= 'Z')) || \
120 ((c >= 'a') && (c <= 'z')) )
121 #define RE_IS_LINE_TERM(c) ((c == '\n') || (c == '\r') || \
122 (c == LINE_SEPARATOR) || (c == PARA_SEPARATOR))
124 #define JS_ISWORD(c) ((c) < 128 && (isalnum(c) || (c) == '_'))
126 #define JS7_ISDEC(c) ((((unsigned)(c)) - '0') <= 9)
127 #define JS7_UNDEC(c) ((c) - '0')
179 REOP_LIMIT /* META: no operator >= to this */
182 #define REOP_IS_SIMPLE(op) ((op) <= REOP_NCLASS)
184 const char *reop_names[] = {
237 typedef struct RECapture {
238 ptrdiff_t index; /* start of contents, -1 for empty */
239 size_t length; /* length of capture */
242 typedef struct REMatchState {
244 RECapture parens[1]; /* first of 're->parenCount' captures,
245 allocated at end of this struct */
248 typedef struct REProgState {
249 jsbytecode *continue_pc; /* current continuation data */
250 jsbytecode continue_op;
251 ptrdiff_t index; /* progress in text */
252 size_t parenSoFar; /* highest indexed paren started */
255 UINT min; /* current quantifier limits */
259 size_t top; /* backtrack stack state */
265 typedef struct REBackTrackData {
266 size_t sz; /* size of previous stack entry */
267 jsbytecode *backtrack_pc; /* where to backtrack to */
268 jsbytecode backtrack_op;
269 const WCHAR *cp; /* index in text of match at backtrack */
270 size_t parenIndex; /* start index of saved paren contents */
271 size_t parenCount; /* # of saved paren contents */
272 size_t saveStateStackTop; /* number of parent states */
273 /* saved parent states follow */
274 /* saved paren contents follow */
277 #define INITIAL_STATESTACK 100
278 #define INITIAL_BACKTRACK 8000
280 typedef struct REGlobalData {
282 JSRegExp *regexp; /* the RE in execution */
283 BOOL ok; /* runtime error (out_of_memory only?) */
284 size_t start; /* offset to start at */
285 ptrdiff_t skipped; /* chars skipped anchoring this r.e. */
286 const WCHAR *cpbegin; /* text base address */
287 const WCHAR *cpend; /* text limit address */
289 REProgState *stateStack; /* stack of state of current parents */
290 size_t stateStackTop;
291 size_t stateStackLimit;
293 REBackTrackData *backTrackStack;/* stack of matched-so-far positions */
294 REBackTrackData *backTrackSP;
295 size_t backTrackStackSize;
296 size_t cursz; /* size of current stack entry */
297 size_t backTrackCount; /* how many times we've backtracked */
298 size_t backTrackLimit; /* upper limit on backtrack states */
300 jsheap_t *pool; /* It's faster to use one malloc'd pool
301 than to malloc/free the three items
302 that are allocated from this pool */
305 typedef struct RENode RENode;
307 REOp op; /* r.e. op bytecode */
308 RENode *next; /* next in concatenation order */
309 void *kid; /* first operand */
311 void *kid2; /* second operand */
312 INT num; /* could be a number */
313 size_t parenIndex; /* or a parenthesis index */
314 struct { /* or a quantifier range */
319 struct { /* or a character class */
321 size_t kidlen; /* length of string at kid, in jschars */
322 size_t index; /* index into class list */
323 WORD bmsize; /* bitmap size, based on max char code */
326 struct { /* or a literal sequence */
327 WCHAR chr; /* of one character */
328 size_t length; /* or many (via the kid) */
331 RENode *kid2; /* second operand from ALT */
332 WCHAR ch1; /* match char for ALTPREREQ */
333 WCHAR ch2; /* ditto, or class index for ALTPREREQ2 */
338 #define CLASS_CACHE_SIZE 4
340 typedef struct CompilerState {
341 script_ctx_t *context;
342 const WCHAR *cpbegin;
346 size_t classCount; /* number of [] encountered */
347 size_t treeDepth; /* maximum depth of parse tree */
348 size_t progLength; /* estimated bytecode length */
350 size_t classBitmapsMem; /* memory to hold all class bitmaps */
352 const WCHAR *start; /* small cache of class strings */
353 size_t length; /* since they're often the same */
355 } classCache[CLASS_CACHE_SIZE];
359 typedef struct EmitStateStackEntry {
360 jsbytecode *altHead; /* start of REOP_ALT* opcode */
361 jsbytecode *nextAltFixup; /* fixup pointer to next-alt offset */
362 jsbytecode *nextTermFixup; /* fixup ptr. to REOP_JUMP offset */
363 jsbytecode *endTermFixup; /* fixup ptr. to REOPT_ALTPREREQ* offset */
364 RENode *continueNode; /* original REOP_ALT* node being stacked */
365 jsbytecode continueOp; /* REOP_JUMP or REOP_ENDALT continuation */
366 JSPackedBool jumpToJumpFlag; /* true if we've patched jump-to-jump to
367 avoid 16-bit unsigned offset overflow */
368 } EmitStateStackEntry;
371 * Immediate operand sizes and getter/setters. Unlike the ones in jsopcode.h,
372 * the getters and setters take the pc of the offset, not of the opcode before
376 #define GET_ARG(pc) ((WORD)(((pc)[0] << 8) | (pc)[1]))
377 #define SET_ARG(pc, arg) ((pc)[0] = (jsbytecode) ((arg) >> 8), \
378 (pc)[1] = (jsbytecode) (arg))
380 #define OFFSET_LEN ARG_LEN
381 #define OFFSET_MAX ((1 << (ARG_LEN * 8)) - 1)
382 #define GET_OFFSET(pc) GET_ARG(pc)
384 static BOOL ParseRegExp(CompilerState*);
387 * Maximum supported tree depth is maximum size of EmitStateStackEntry stack.
388 * For sanity, we limit it to 2^24 bytes.
390 #define TREE_DEPTH_MAX ((1 << 24) / sizeof(EmitStateStackEntry))
393 * The maximum memory that can be allocated for class bitmaps.
394 * For sanity, we limit it to 2^24 bytes.
396 #define CLASS_BITMAPS_MEM_LIMIT (1 << 24)
399 * Functions to get size and write/read bytecode that represent small indexes
401 * Each byte in the code represent 7-bit chunk of the index. 8th bit when set
402 * indicates that the following byte brings more bits to the index. Otherwise
403 * this is the last byte in the index bytecode representing highest index bits.
406 GetCompactIndexWidth(size_t index)
410 for (width = 1; (index >>= 7) != 0; ++width) { }
414 static inline jsbytecode *
415 WriteCompactIndex(jsbytecode *pc, size_t index)
419 while ((next = index >> 7) != 0) {
420 *pc++ = (jsbytecode)(index | 0x80);
423 *pc++ = (jsbytecode)index;
427 static inline jsbytecode *
428 ReadCompactIndex(jsbytecode *pc, size_t *result)
433 if ((nextByte & 0x80) == 0) {
435 * Short-circuit the most common case when compact index <= 127.
440 *result = 0x7F & nextByte;
443 *result |= (nextByte & 0x7F) << shift;
445 } while ((nextByte & 0x80) != 0);
450 /* Construct and initialize an RENode, returning NULL for out-of-memory */
452 NewRENode(CompilerState *state, REOp op)
456 ren = jsheap_alloc(&state->context->tmp_heap, sizeof(*ren));
458 /* js_ReportOutOfScriptQuota(cx); */
468 * Validates and converts hex ascii value.
471 isASCIIHexDigit(WCHAR c, UINT *digit)
482 if (cv >= 'a' && cv <= 'f') {
483 *digit = cv - 'a' + 10;
495 #define JUMP_OFFSET_HI(off) ((jsbytecode)((off) >> 8))
496 #define JUMP_OFFSET_LO(off) ((jsbytecode)(off))
499 SetForwardJumpOffset(jsbytecode *jump, jsbytecode *target)
501 ptrdiff_t offset = target - jump;
503 /* Check that target really points forward. */
505 if ((size_t)offset > OFFSET_MAX)
508 jump[0] = JUMP_OFFSET_HI(offset);
509 jump[1] = JUMP_OFFSET_LO(offset);
514 * Generate bytecode for the tree rooted at t using an explicit stack instead
518 EmitREBytecode(CompilerState *state, JSRegExp *re, size_t treeDepth,
519 jsbytecode *pc, RENode *t)
521 EmitStateStackEntry *emitStateSP, *emitStateStack;
525 if (treeDepth == 0) {
526 emitStateStack = NULL;
528 emitStateStack = heap_alloc(sizeof(EmitStateStackEntry) * treeDepth);
532 emitStateSP = emitStateStack;
534 assert(op < REOP_LIMIT);
543 case REOP_ALTPREREQ2:
546 emitStateSP->altHead = pc - 1;
547 emitStateSP->endTermFixup = pc;
549 SET_ARG(pc, t->u.altprereq.ch1);
551 SET_ARG(pc, t->u.altprereq.ch2);
554 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
557 emitStateSP->continueNode = t;
558 emitStateSP->continueOp = REOP_JUMP;
559 emitStateSP->jumpToJumpFlag = FALSE;
561 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
562 t = (RENode *) t->kid;
564 assert(op < REOP_LIMIT);
568 emitStateSP->nextTermFixup = pc; /* offset to following term */
570 if (!SetForwardJumpOffset(emitStateSP->nextAltFixup, pc))
572 emitStateSP->continueOp = REOP_ENDALT;
574 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
575 t = (RENode *) t->u.kid2;
577 assert(op < REOP_LIMIT);
582 * If we already patched emitStateSP->nextTermFixup to jump to
583 * a nearer jump, to avoid 16-bit immediate offset overflow, we
586 if (emitStateSP->jumpToJumpFlag)
590 * Fix up the REOP_JUMP offset to go to the op after REOP_ENDALT.
591 * REOP_ENDALT is executed only on successful match of the last
592 * alternate in a group.
594 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
596 if (t->op != REOP_ALT) {
597 if (!SetForwardJumpOffset(emitStateSP->endTermFixup, pc))
602 * If the program is bigger than the REOP_JUMP offset range, then
603 * we must check for alternates before this one that are part of
604 * the same group, and fix up their jump offsets to target jumps
605 * close enough to fit in a 16-bit unsigned offset immediate.
607 if ((size_t)(pc - re->program) > OFFSET_MAX &&
608 emitStateSP > emitStateStack) {
609 EmitStateStackEntry *esp, *esp2;
610 jsbytecode *alt, *jump;
611 ptrdiff_t span, header;
615 for (esp = esp2 - 1; esp >= emitStateStack; --esp) {
616 if (esp->continueOp == REOP_ENDALT &&
617 !esp->jumpToJumpFlag &&
618 esp->nextTermFixup + OFFSET_LEN == alt &&
619 (size_t)(pc - ((esp->continueNode->op != REOP_ALT)
621 : esp->nextTermFixup)) > OFFSET_MAX) {
623 jump = esp->nextTermFixup;
626 * The span must be 1 less than the distance from
627 * jump offset to jump offset, so we actually jump
628 * to a REOP_JUMP bytecode, not to its offset!
631 assert(jump < esp2->nextTermFixup);
632 span = esp2->nextTermFixup - jump - 1;
633 if ((size_t)span <= OFFSET_MAX)
638 } while (esp2->continueOp != REOP_ENDALT);
641 jump[0] = JUMP_OFFSET_HI(span);
642 jump[1] = JUMP_OFFSET_LO(span);
644 if (esp->continueNode->op != REOP_ALT) {
646 * We must patch the offset at esp->endTermFixup
647 * as well, for the REOP_ALTPREREQ{,2} opcodes.
648 * If we're unlucky and endTermFixup is more than
649 * OFFSET_MAX bytes from its target, we cheat by
650 * jumping 6 bytes to the jump whose offset is at
651 * esp->nextTermFixup, which has the same target.
653 jump = esp->endTermFixup;
654 header = esp->nextTermFixup - jump;
656 if ((size_t)span > OFFSET_MAX)
659 jump[0] = JUMP_OFFSET_HI(span);
660 jump[1] = JUMP_OFFSET_LO(span);
663 esp->jumpToJumpFlag = TRUE;
671 emitStateSP->altHead = pc - 1;
672 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
674 emitStateSP->continueNode = t;
675 emitStateSP->continueOp = REOP_JUMP;
676 emitStateSP->jumpToJumpFlag = FALSE;
678 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
679 t = (RENode *) t->kid;
681 assert(op < REOP_LIMIT);
686 * Coalesce FLATs if possible and if it would not increase bytecode
687 * beyond preallocated limit. The latter happens only when bytecode
688 * size for coalesced string with offset p and length 2 exceeds 6
689 * bytes preallocated for 2 single char nodes, i.e. when
690 * 1 + GetCompactIndexWidth(p) + GetCompactIndexWidth(2) > 6 or
691 * GetCompactIndexWidth(p) > 4.
692 * Since when GetCompactIndexWidth(p) <= 4 coalescing of 3 or more
693 * nodes strictly decreases bytecode size, the check has to be
694 * done only for the first coalescing.
697 GetCompactIndexWidth((WCHAR*)t->kid - state->cpbegin) <= 4)
700 t->next->op == REOP_FLAT &&
701 (WCHAR*)t->kid + t->u.flat.length ==
702 (WCHAR*)t->next->kid) {
703 t->u.flat.length += t->next->u.flat.length;
704 t->next = t->next->next;
707 if (t->kid && t->u.flat.length > 1) {
708 pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLATi : REOP_FLAT;
709 pc = WriteCompactIndex(pc, (WCHAR*)t->kid - state->cpbegin);
710 pc = WriteCompactIndex(pc, t->u.flat.length);
711 } else if (t->u.flat.chr < 256) {
712 pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLAT1i : REOP_FLAT1;
713 *pc++ = (jsbytecode) t->u.flat.chr;
715 pc[-1] = (state->flags & JSREG_FOLD)
718 SET_ARG(pc, t->u.flat.chr);
725 pc = WriteCompactIndex(pc, t->u.parenIndex);
726 emitStateSP->continueNode = t;
727 emitStateSP->continueOp = REOP_RPAREN;
729 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
730 t = (RENode *) t->kid;
735 pc = WriteCompactIndex(pc, t->u.parenIndex);
739 pc = WriteCompactIndex(pc, t->u.parenIndex);
744 emitStateSP->nextTermFixup = pc;
746 emitStateSP->continueNode = t;
747 emitStateSP->continueOp = REOP_ASSERTTEST;
749 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
750 t = (RENode *) t->kid;
754 case REOP_ASSERTTEST:
755 case REOP_ASSERTNOTTEST:
756 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
760 case REOP_ASSERT_NOT:
762 emitStateSP->nextTermFixup = pc;
764 emitStateSP->continueNode = t;
765 emitStateSP->continueOp = REOP_ASSERTNOTTEST;
767 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
768 t = (RENode *) t->kid;
774 if (t->u.range.min == 0 && t->u.range.max == (UINT)-1) {
775 pc[-1] = (t->u.range.greedy) ? REOP_STAR : REOP_MINIMALSTAR;
776 } else if (t->u.range.min == 0 && t->u.range.max == 1) {
777 pc[-1] = (t->u.range.greedy) ? REOP_OPT : REOP_MINIMALOPT;
778 } else if (t->u.range.min == 1 && t->u.range.max == (UINT) -1) {
779 pc[-1] = (t->u.range.greedy) ? REOP_PLUS : REOP_MINIMALPLUS;
781 if (!t->u.range.greedy)
782 pc[-1] = REOP_MINIMALQUANT;
783 pc = WriteCompactIndex(pc, t->u.range.min);
785 * Write max + 1 to avoid using size_t(max) + 1 bytes
786 * for (UINT)-1 sentinel.
788 pc = WriteCompactIndex(pc, t->u.range.max + 1);
790 emitStateSP->nextTermFixup = pc;
792 emitStateSP->continueNode = t;
793 emitStateSP->continueOp = REOP_ENDCHILD;
795 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
796 t = (RENode *) t->kid;
801 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
806 if (!t->u.ucclass.sense)
807 pc[-1] = REOP_NCLASS;
808 pc = WriteCompactIndex(pc, t->u.ucclass.index);
809 charSet = &re->classList[t->u.ucclass.index];
810 charSet->converted = FALSE;
811 charSet->length = t->u.ucclass.bmsize;
812 charSet->u.src.startIndex = t->u.ucclass.startIndex;
813 charSet->u.src.length = t->u.ucclass.kidlen;
814 charSet->sense = t->u.ucclass.sense;
825 if (emitStateSP == emitStateStack)
828 t = emitStateSP->continueNode;
829 op = (REOp) emitStateSP->continueOp;
834 heap_free(emitStateStack);
838 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
844 * Process the op against the two top operands, reducing them to a single
845 * operand in the penultimate slot. Update progLength and treeDepth.
848 ProcessOp(CompilerState *state, REOpData *opData, RENode **operandStack,
853 switch (opData->op) {
855 result = NewRENode(state, REOP_ALT);
858 result->kid = operandStack[operandSP - 2];
859 result->u.kid2 = operandStack[operandSP - 1];
860 operandStack[operandSP - 2] = result;
862 if (state->treeDepth == TREE_DEPTH_MAX) {
863 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
869 * Look at both alternates to see if there's a FLAT or a CLASS at
870 * the start of each. If so, use a prerequisite match.
872 if (((RENode *) result->kid)->op == REOP_FLAT &&
873 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
874 (state->flags & JSREG_FOLD) == 0) {
875 result->op = REOP_ALTPREREQ;
876 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
877 result->u.altprereq.ch2 = ((RENode *) result->u.kid2)->u.flat.chr;
878 /* ALTPREREQ, <end>, uch1, uch2, <next>, ...,
879 JUMP, <end> ... ENDALT */
880 state->progLength += 13;
883 if (((RENode *) result->kid)->op == REOP_CLASS &&
884 ((RENode *) result->kid)->u.ucclass.index < 256 &&
885 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
886 (state->flags & JSREG_FOLD) == 0) {
887 result->op = REOP_ALTPREREQ2;
888 result->u.altprereq.ch1 = ((RENode *) result->u.kid2)->u.flat.chr;
889 result->u.altprereq.ch2 = ((RENode *) result->kid)->u.ucclass.index;
890 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
891 JUMP, <end> ... ENDALT */
892 state->progLength += 13;
895 if (((RENode *) result->kid)->op == REOP_FLAT &&
896 ((RENode *) result->u.kid2)->op == REOP_CLASS &&
897 ((RENode *) result->u.kid2)->u.ucclass.index < 256 &&
898 (state->flags & JSREG_FOLD) == 0) {
899 result->op = REOP_ALTPREREQ2;
900 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
901 result->u.altprereq.ch2 =
902 ((RENode *) result->u.kid2)->u.ucclass.index;
903 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
904 JUMP, <end> ... ENDALT */
905 state->progLength += 13;
908 /* ALT, <next>, ..., JUMP, <end> ... ENDALT */
909 state->progLength += 7;
914 result = operandStack[operandSP - 2];
916 result = result->next;
917 result->next = operandStack[operandSP - 1];
921 case REOP_ASSERT_NOT:
924 /* These should have been processed by a close paren. */
925 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_MISSING_PAREN,
935 * Hack two bits in CompilerState.flags, for use within FindParenCount to flag
936 * its being on the stack, and to propagate errors to its callers.
938 #define JSREG_FIND_PAREN_COUNT 0x8000
939 #define JSREG_FIND_PAREN_ERROR 0x4000
942 * Magic return value from FindParenCount and GetDecimalValue, to indicate
943 * overflow beyond GetDecimalValue's max parameter, or a computed maximum if
944 * its findMax parameter is non-null.
946 #define OVERFLOW_VALUE ((UINT)-1)
949 FindParenCount(CompilerState *state)
954 if (state->flags & JSREG_FIND_PAREN_COUNT)
955 return OVERFLOW_VALUE;
958 * Copy state into temp, flag it so we never report an invalid backref,
959 * and reset its members to parse the entire regexp. This is obviously
960 * suboptimal, but GetDecimalValue calls us only if a backref appears to
961 * refer to a forward parenthetical, which is rare.
964 temp.flags |= JSREG_FIND_PAREN_COUNT;
965 temp.cp = temp.cpbegin;
970 temp.classBitmapsMem = 0;
971 for (i = 0; i < CLASS_CACHE_SIZE; i++)
972 temp.classCache[i].start = NULL;
974 if (!ParseRegExp(&temp)) {
975 state->flags |= JSREG_FIND_PAREN_ERROR;
976 return OVERFLOW_VALUE;
978 return temp.parenCount;
982 * Extract and return a decimal value at state->cp. The initial character c
983 * has already been read. Return OVERFLOW_VALUE if the result exceeds max.
984 * Callers who pass a non-null findMax should test JSREG_FIND_PAREN_ERROR in
985 * state->flags to discover whether an error occurred under findMax.
988 GetDecimalValue(WCHAR c, UINT max, UINT (*findMax)(CompilerState *state),
989 CompilerState *state)
991 UINT value = JS7_UNDEC(c);
992 BOOL overflow = (value > max && (!findMax || value > findMax(state)));
994 /* The following restriction allows simpler overflow checks. */
995 assert(max <= ((UINT)-1 - 9) / 10);
996 while (state->cp < state->cpend) {
1000 value = 10 * value + JS7_UNDEC(c);
1001 if (!overflow && value > max && (!findMax || value > findMax(state)))
1005 return overflow ? OVERFLOW_VALUE : value;
1009 * Calculate the total size of the bitmap required for a class expression.
1012 CalculateBitmapSize(CompilerState *state, RENode *target, const WCHAR *src,
1016 BOOL inRange = FALSE;
1017 WCHAR c, rangeStart = 0;
1018 UINT n, digit, nDigits, i;
1020 target->u.ucclass.bmsize = 0;
1021 target->u.ucclass.sense = TRUE;
1028 target->u.ucclass.sense = FALSE;
1031 while (src != end) {
1032 BOOL canStartRange = TRUE;
1059 if (src < end && RE_IS_LETTER(*src)) {
1060 localMax = (UINT) (*src++) & 0x1F;
1073 for (i = 0; (i < nDigits) && (src < end); i++) {
1075 if (!isASCIIHexDigit(c, &digit)) {
1077 * Back off to accepting the original
1084 n = (n << 4) | digit;
1089 canStartRange = FALSE;
1091 JS_ReportErrorNumber(state->context,
1092 js_GetErrorMessage, NULL,
1093 JSMSG_BAD_CLASS_RANGE);
1103 canStartRange = FALSE;
1105 JS_ReportErrorNumber(state->context,
1106 js_GetErrorMessage, NULL,
1107 JSMSG_BAD_CLASS_RANGE);
1113 * If this is the start of a range, ensure that it's less than
1127 * This is a non-ECMA extension - decimal escapes (in this
1128 * case, octal!) are supposed to be an error inside class
1129 * ranges, but supported here for backwards compatibility.
1134 if ('0' <= c && c <= '7') {
1136 n = 8 * n + JS7_UNDEC(c);
1138 if ('0' <= c && c <= '7') {
1140 i = 8 * n + JS7_UNDEC(c);
1161 /* Throw a SyntaxError here, per ECMA-262, 15.10.2.15. */
1162 if (rangeStart > localMax) {
1163 JS_ReportErrorNumber(state->context,
1164 js_GetErrorMessage, NULL,
1165 JSMSG_BAD_CLASS_RANGE);
1170 if (canStartRange && src < end - 1) {
1174 rangeStart = (WCHAR)localMax;
1178 if (state->flags & JSREG_FOLD)
1179 rangeStart = localMax; /* one run of the uc/dc loop below */
1182 if (state->flags & JSREG_FOLD) {
1183 WCHAR maxch = localMax;
1185 for (i = rangeStart; i <= localMax; i++) {
1201 target->u.ucclass.bmsize = max;
1206 ParseMinMaxQuantifier(CompilerState *state, BOOL ignoreValues)
1210 const WCHAR *errp = state->cp++;
1215 min = GetDecimalValue(c, 0xFFFF, NULL, state);
1218 if (!ignoreValues && min == OVERFLOW_VALUE)
1219 return JSMSG_MIN_TOO_BIG;
1225 max = GetDecimalValue(c, 0xFFFF, NULL, state);
1227 if (!ignoreValues && max == OVERFLOW_VALUE)
1228 return JSMSG_MAX_TOO_BIG;
1229 if (!ignoreValues && min > max)
1230 return JSMSG_OUT_OF_ORDER;
1238 state->result = NewRENode(state, REOP_QUANT);
1240 return JSMSG_OUT_OF_MEMORY;
1241 state->result->u.range.min = min;
1242 state->result->u.range.max = max;
1244 * QUANT, <min>, <max>, <next> ... <ENDCHILD>
1245 * where <max> is written as compact(max+1) to make
1246 * (UINT)-1 sentinel to occupy 1 byte, not width_of(max)+1.
1248 state->progLength += (1 + GetCompactIndexWidth(min)
1249 + GetCompactIndexWidth(max + 1)
1260 ParseQuantifier(CompilerState *state)
1263 term = state->result;
1264 if (state->cp < state->cpend) {
1265 switch (*state->cp) {
1267 state->result = NewRENode(state, REOP_QUANT);
1270 state->result->u.range.min = 1;
1271 state->result->u.range.max = (UINT)-1;
1272 /* <PLUS>, <next> ... <ENDCHILD> */
1273 state->progLength += 4;
1276 state->result = NewRENode(state, REOP_QUANT);
1279 state->result->u.range.min = 0;
1280 state->result->u.range.max = (UINT)-1;
1281 /* <STAR>, <next> ... <ENDCHILD> */
1282 state->progLength += 4;
1285 state->result = NewRENode(state, REOP_QUANT);
1288 state->result->u.range.min = 0;
1289 state->result->u.range.max = 1;
1290 /* <OPT>, <next> ... <ENDCHILD> */
1291 state->progLength += 4;
1293 case '{': /* balance '}' */
1297 err = ParseMinMaxQuantifier(state, FALSE);
1303 ReportRegExpErrorHelper(state, JSREPORT_ERROR, err, errp);
1312 if (state->treeDepth == TREE_DEPTH_MAX) {
1313 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1319 state->result->kid = term;
1320 if (state->cp < state->cpend && *state->cp == '?') {
1322 state->result->u.range.greedy = FALSE;
1324 state->result->u.range.greedy = TRUE;
1330 * item: assertion An item is either an assertion or
1331 * quantatom a quantified atom.
1333 * assertion: '^' Assertions match beginning of string
1334 * (or line if the class static property
1335 * RegExp.multiline is true).
1336 * '$' End of string (or line if the class
1337 * static property RegExp.multiline is
1339 * '\b' Word boundary (between \w and \W).
1340 * '\B' Word non-boundary.
1342 * quantatom: atom An unquantified atom.
1343 * quantatom '{' n ',' m '}'
1344 * Atom must occur between n and m times.
1345 * quantatom '{' n ',' '}' Atom must occur at least n times.
1346 * quantatom '{' n '}' Atom must occur exactly n times.
1347 * quantatom '*' Zero or more times (same as {0,}).
1348 * quantatom '+' One or more times (same as {1,}).
1349 * quantatom '?' Zero or one time (same as {0,1}).
1351 * any of which can be optionally followed by '?' for ungreedy
1353 * atom: '(' regexp ')' A parenthesized regexp (what matched
1354 * can be addressed using a backreference,
1356 * '.' Matches any char except '\n'.
1357 * '[' classlist ']' A character class.
1358 * '[' '^' classlist ']' A negated character class.
1360 * '\n' Newline (Line Feed).
1361 * '\r' Carriage Return.
1362 * '\t' Horizontal Tab.
1363 * '\v' Vertical Tab.
1364 * '\d' A digit (same as [0-9]).
1366 * '\w' A word character, [0-9a-z_A-Z].
1367 * '\W' A non-word character.
1368 * '\s' A whitespace character, [ \b\f\n\r\t\v].
1369 * '\S' A non-whitespace character.
1370 * '\' n A backreference to the nth (n decimal
1371 * and positive) parenthesized expression.
1372 * '\' octal An octal escape sequence (octal must be
1373 * two or three digits long, unless it is
1374 * 0 for the null character).
1375 * '\x' hex A hex escape (hex must be two digits).
1376 * '\u' unicode A unicode escape (must be four digits).
1377 * '\c' ctrl A control character, ctrl is a letter.
1378 * '\' literalatomchar Any character except one of the above
1379 * that follow '\' in an atom.
1380 * otheratomchar Any character not first among the other
1381 * atom right-hand sides.
1384 ParseTerm(CompilerState *state)
1386 WCHAR c = *state->cp++;
1388 UINT num, tmp, n, i;
1389 const WCHAR *termStart;
1392 /* assertions and atoms */
1394 state->result = NewRENode(state, REOP_BOL);
1397 state->progLength++;
1400 state->result = NewRENode(state, REOP_EOL);
1403 state->progLength++;
1406 if (state->cp >= state->cpend) {
1407 /* a trailing '\' is an error */
1408 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_TRAILING_SLASH);
1413 /* assertion escapes */
1415 state->result = NewRENode(state, REOP_WBDRY);
1418 state->progLength++;
1421 state->result = NewRENode(state, REOP_WNONBDRY);
1424 state->progLength++;
1426 /* Decimal escape */
1428 /* Give a strict warning. See also the note below. */
1429 WARN("non-octal digit in an escape sequence that doesn't match a back-reference\n");
1432 while (state->cp < state->cpend) {
1434 if (c < '0' || '7' < c)
1437 tmp = 8 * num + (UINT)JS7_UNDEC(c);
1444 state->result = NewRENode(state, REOP_FLAT);
1447 state->result->u.flat.chr = c;
1448 state->result->u.flat.length = 1;
1449 state->progLength += 3;
1460 termStart = state->cp - 1;
1461 num = GetDecimalValue(c, state->parenCount, FindParenCount, state);
1462 if (state->flags & JSREG_FIND_PAREN_ERROR)
1464 if (num == OVERFLOW_VALUE) {
1465 /* Give a strict mode warning. */
1466 WARN("back-reference exceeds number of capturing parentheses\n");
1469 * Note: ECMA 262, 15.10.2.9 says that we should throw a syntax
1470 * error here. However, for compatibility with IE, we treat the
1471 * whole backref as flat if the first character in it is not a
1472 * valid octal character, and as an octal escape otherwise.
1474 state->cp = termStart;
1476 /* Treat this as flat. termStart - 1 is the \. */
1481 /* Treat this as an octal escape. */
1484 assert(1 <= num && num <= 0x10000);
1485 state->result = NewRENode(state, REOP_BACKREF);
1488 state->result->u.parenIndex = num - 1;
1490 += 1 + GetCompactIndexWidth(state->result->u.parenIndex);
1492 /* Control escape */
1508 /* Control letter */
1510 if (state->cp < state->cpend && RE_IS_LETTER(*state->cp)) {
1511 c = (WCHAR) (*state->cp++ & 0x1F);
1513 /* back off to accepting the original '\' as a literal */
1518 /* HexEscapeSequence */
1522 /* UnicodeEscapeSequence */
1527 for (i = 0; i < nDigits && state->cp < state->cpend; i++) {
1530 if (!isASCIIHexDigit(c, &digit)) {
1532 * Back off to accepting the original 'u' or 'x' as a
1539 n = (n << 4) | digit;
1543 /* Character class escapes */
1545 state->result = NewRENode(state, REOP_DIGIT);
1549 state->progLength++;
1552 state->result = NewRENode(state, REOP_NONDIGIT);
1555 state->result = NewRENode(state, REOP_SPACE);
1558 state->result = NewRENode(state, REOP_NONSPACE);
1561 state->result = NewRENode(state, REOP_ALNUM);
1564 state->result = NewRENode(state, REOP_NONALNUM);
1566 /* IdentityEscape */
1568 state->result = NewRENode(state, REOP_FLAT);
1571 state->result->u.flat.chr = c;
1572 state->result->u.flat.length = 1;
1573 state->result->kid = (void *) (state->cp - 1);
1574 state->progLength += 3;
1579 state->result = NewRENode(state, REOP_CLASS);
1582 termStart = state->cp;
1583 state->result->u.ucclass.startIndex = termStart - state->cpbegin;
1585 if (state->cp == state->cpend) {
1586 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1587 JSMSG_UNTERM_CLASS, termStart);
1591 if (*state->cp == '\\') {
1593 if (state->cp != state->cpend)
1597 if (*state->cp == ']') {
1598 state->result->u.ucclass.kidlen = state->cp - termStart;
1603 for (i = 0; i < CLASS_CACHE_SIZE; i++) {
1604 if (!state->classCache[i].start) {
1605 state->classCache[i].start = termStart;
1606 state->classCache[i].length = state->result->u.ucclass.kidlen;
1607 state->classCache[i].index = state->classCount;
1610 if (state->classCache[i].length ==
1611 state->result->u.ucclass.kidlen) {
1612 for (n = 0; ; n++) {
1613 if (n == state->classCache[i].length) {
1614 state->result->u.ucclass.index
1615 = state->classCache[i].index;
1618 if (state->classCache[i].start[n] != termStart[n])
1623 state->result->u.ucclass.index = state->classCount++;
1627 * Call CalculateBitmapSize now as we want any errors it finds
1628 * to be reported during the parse phase, not at execution.
1630 if (!CalculateBitmapSize(state, state->result, termStart, state->cp++))
1633 * Update classBitmapsMem with number of bytes to hold bmsize bits,
1634 * which is (bitsCount + 7) / 8 or (highest_bit + 1 + 7) / 8
1635 * or highest_bit / 8 + 1 where highest_bit is u.ucclass.bmsize.
1637 n = (state->result->u.ucclass.bmsize >> 3) + 1;
1638 if (n > CLASS_BITMAPS_MEM_LIMIT - state->classBitmapsMem) {
1639 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1642 state->classBitmapsMem += n;
1643 /* CLASS, <index> */
1645 += 1 + GetCompactIndexWidth(state->result->u.ucclass.index);
1649 state->result = NewRENode(state, REOP_DOT);
1654 const WCHAR *errp = state->cp--;
1657 err = ParseMinMaxQuantifier(state, TRUE);
1668 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1669 JSMSG_BAD_QUANTIFIER, state->cp - 1);
1673 state->result = NewRENode(state, REOP_FLAT);
1676 state->result->u.flat.chr = c;
1677 state->result->u.flat.length = 1;
1678 state->result->kid = (void *) (state->cp - 1);
1679 state->progLength += 3;
1682 return ParseQuantifier(state);
1686 * Top-down regular expression grammar, based closely on Perl4.
1688 * regexp: altern A regular expression is one or more
1689 * altern '|' regexp alternatives separated by vertical bar.
1691 #define INITIAL_STACK_SIZE 128
1694 ParseRegExp(CompilerState *state)
1698 REOpData *operatorStack;
1699 RENode **operandStack;
1702 BOOL result = FALSE;
1704 INT operatorSP = 0, operatorStackSize = INITIAL_STACK_SIZE;
1705 INT operandSP = 0, operandStackSize = INITIAL_STACK_SIZE;
1707 /* Watch out for empty regexp */
1708 if (state->cp == state->cpend) {
1709 state->result = NewRENode(state, REOP_EMPTY);
1710 return (state->result != NULL);
1713 operatorStack = heap_alloc(sizeof(REOpData) * operatorStackSize);
1717 operandStack = heap_alloc(sizeof(RENode *) * operandStackSize);
1722 parenIndex = state->parenCount;
1723 if (state->cp == state->cpend) {
1725 * If we are at the end of the regexp and we're short one or more
1726 * operands, the regexp must have the form /x|/ or some such, with
1727 * left parentheses making us short more than one operand.
1729 if (operatorSP >= operandSP) {
1730 operand = NewRENode(state, REOP_EMPTY);
1736 switch (*state->cp) {
1739 if (state->cp + 1 < state->cpend &&
1740 *state->cp == '?' &&
1741 (state->cp[1] == '=' ||
1742 state->cp[1] == '!' ||
1743 state->cp[1] == ':')) {
1744 switch (state->cp[1]) {
1747 /* ASSERT, <next>, ... ASSERTTEST */
1748 state->progLength += 4;
1751 op = REOP_ASSERT_NOT;
1752 /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
1753 state->progLength += 4;
1756 op = REOP_LPARENNON;
1762 /* LPAREN, <index>, ... RPAREN, <index> */
1764 += 2 * (1 + GetCompactIndexWidth(parenIndex));
1765 state->parenCount++;
1766 if (state->parenCount == 65535) {
1767 ReportRegExpError(state, JSREPORT_ERROR,
1768 JSMSG_TOO_MANY_PARENS);
1776 * If there's no stacked open parenthesis, throw syntax error.
1778 for (i = operatorSP - 1; ; i--) {
1780 ReportRegExpError(state, JSREPORT_ERROR,
1781 JSMSG_UNMATCHED_RIGHT_PAREN);
1784 if (operatorStack[i].op == REOP_ASSERT ||
1785 operatorStack[i].op == REOP_ASSERT_NOT ||
1786 operatorStack[i].op == REOP_LPARENNON ||
1787 operatorStack[i].op == REOP_LPAREN) {
1794 /* Expected an operand before these, so make an empty one */
1795 operand = NewRENode(state, REOP_EMPTY);
1801 if (!ParseTerm(state))
1803 operand = state->result;
1805 if (operandSP == operandStackSize) {
1807 operandStackSize += operandStackSize;
1808 tmp = heap_realloc(operandStack, sizeof(RENode *) * operandStackSize);
1813 operandStack[operandSP++] = operand;
1818 /* At the end; process remaining operators. */
1820 if (state->cp == state->cpend) {
1821 while (operatorSP) {
1823 if (!ProcessOp(state, &operatorStack[operatorSP],
1824 operandStack, operandSP))
1828 assert(operandSP == 1);
1829 state->result = operandStack[0];
1834 switch (*state->cp) {
1836 /* Process any stacked 'concat' operators */
1838 while (operatorSP &&
1839 operatorStack[operatorSP - 1].op == REOP_CONCAT) {
1841 if (!ProcessOp(state, &operatorStack[operatorSP],
1842 operandStack, operandSP)) {
1852 * If there's no stacked open parenthesis, throw syntax error.
1854 for (i = operatorSP - 1; ; i--) {
1856 ReportRegExpError(state, JSREPORT_ERROR,
1857 JSMSG_UNMATCHED_RIGHT_PAREN);
1860 if (operatorStack[i].op == REOP_ASSERT ||
1861 operatorStack[i].op == REOP_ASSERT_NOT ||
1862 operatorStack[i].op == REOP_LPARENNON ||
1863 operatorStack[i].op == REOP_LPAREN) {
1869 /* Process everything on the stack until the open parenthesis. */
1873 switch (operatorStack[operatorSP].op) {
1875 case REOP_ASSERT_NOT:
1877 operand = NewRENode(state, operatorStack[operatorSP].op);
1880 operand->u.parenIndex =
1881 operatorStack[operatorSP].parenIndex;
1883 operand->kid = operandStack[operandSP - 1];
1884 operandStack[operandSP - 1] = operand;
1885 if (state->treeDepth == TREE_DEPTH_MAX) {
1886 ReportRegExpError(state, JSREPORT_ERROR,
1887 JSMSG_REGEXP_TOO_COMPLEX);
1893 case REOP_LPARENNON:
1894 state->result = operandStack[operandSP - 1];
1895 if (!ParseQuantifier(state))
1897 operandStack[operandSP - 1] = state->result;
1898 goto restartOperator;
1900 if (!ProcessOp(state, &operatorStack[operatorSP],
1901 operandStack, operandSP))
1911 const WCHAR *errp = state->cp;
1913 if (ParseMinMaxQuantifier(state, TRUE) < 0) {
1915 * This didn't even scan correctly as a quantifier, so we should
1929 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_BAD_QUANTIFIER,
1935 /* Anything else is the start of the next term. */
1938 if (operatorSP == operatorStackSize) {
1940 operatorStackSize += operatorStackSize;
1941 tmp = heap_realloc(operatorStack, sizeof(REOpData) * operatorStackSize);
1944 operatorStack = tmp;
1946 operatorStack[operatorSP].op = op;
1947 operatorStack[operatorSP].errPos = state->cp;
1948 operatorStack[operatorSP++].parenIndex = parenIndex;
1953 heap_free(operatorStack);
1954 heap_free(operandStack);
1959 * Save the current state of the match - the position in the input
1960 * text as well as the position in the bytecode. The state of any
1961 * parent expressions is also saved (preceding state).
1962 * Contents of parenCount parentheses from parenIndex are also saved.
1964 static REBackTrackData *
1965 PushBackTrackState(REGlobalData *gData, REOp op,
1966 jsbytecode *target, REMatchState *x, const WCHAR *cp,
1967 size_t parenIndex, size_t parenCount)
1970 REBackTrackData *result =
1971 (REBackTrackData *) ((char *)gData->backTrackSP + gData->cursz);
1973 size_t sz = sizeof(REBackTrackData) +
1974 gData->stateStackTop * sizeof(REProgState) +
1975 parenCount * sizeof(RECapture);
1977 ptrdiff_t btsize = gData->backTrackStackSize;
1978 ptrdiff_t btincr = ((char *)result + sz) -
1979 ((char *)gData->backTrackStack + btsize);
1981 TRACE("\tBT_Push: %lu,%lu\n", (unsigned long) parenIndex, (unsigned long) parenCount);
1983 JS_COUNT_OPERATION(gData->cx, JSOW_JUMP * (1 + parenCount));
1985 ptrdiff_t offset = (char *)result - (char *)gData->backTrackStack;
1987 JS_COUNT_OPERATION(gData->cx, JSOW_ALLOCATION);
1988 btincr = ((btincr+btsize-1)/btsize)*btsize;
1989 gData->backTrackStack = jsheap_grow(gData->pool, gData->backTrackStack, btsize, btincr);
1990 if (!gData->backTrackStack) {
1991 js_ReportOutOfScriptQuota(gData->cx);
1995 gData->backTrackStackSize = btsize + btincr;
1996 result = (REBackTrackData *) ((char *)gData->backTrackStack + offset);
1998 gData->backTrackSP = result;
1999 result->sz = gData->cursz;
2002 result->backtrack_op = op;
2003 result->backtrack_pc = target;
2005 result->parenCount = parenCount;
2006 result->parenIndex = parenIndex;
2008 result->saveStateStackTop = gData->stateStackTop;
2009 assert(gData->stateStackTop);
2010 memcpy(result + 1, gData->stateStack,
2011 sizeof(REProgState) * result->saveStateStackTop);
2013 if (parenCount != 0) {
2014 memcpy((char *)(result + 1) +
2015 sizeof(REProgState) * result->saveStateStackTop,
2016 &x->parens[parenIndex],
2017 sizeof(RECapture) * parenCount);
2018 for (i = 0; i != parenCount; i++)
2019 x->parens[parenIndex + i].index = -1;
2025 static inline REMatchState *
2026 FlatNIMatcher(REGlobalData *gData, REMatchState *x, WCHAR *matchChars,
2030 assert(gData->cpend >= x->cp);
2031 if (length > (size_t)(gData->cpend - x->cp))
2033 for (i = 0; i != length; i++) {
2034 if (toupperW(matchChars[i]) != toupperW(x->cp[i]))
2042 * 1. Evaluate DecimalEscape to obtain an EscapeValue E.
2043 * 2. If E is not a character then go to step 6.
2044 * 3. Let ch be E's character.
2045 * 4. Let A be a one-element RECharSet containing the character ch.
2046 * 5. Call CharacterSetMatcher(A, false) and return its Matcher result.
2047 * 6. E must be an integer. Let n be that integer.
2048 * 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception.
2049 * 8. Return an internal Matcher closure that takes two arguments, a State x
2050 * and a Continuation c, and performs the following:
2051 * 1. Let cap be x's captures internal array.
2052 * 2. Let s be cap[n].
2053 * 3. If s is undefined, then call c(x) and return its result.
2054 * 4. Let e be x's endIndex.
2055 * 5. Let len be s's length.
2056 * 6. Let f be e+len.
2057 * 7. If f>InputLength, return failure.
2058 * 8. If there exists an integer i between 0 (inclusive) and len (exclusive)
2059 * such that Canonicalize(s[i]) is not the same character as
2060 * Canonicalize(Input [e+i]), then return failure.
2061 * 9. Let y be the State (f, cap).
2062 * 10. Call c(y) and return its result.
2064 static REMatchState *
2065 BackrefMatcher(REGlobalData *gData, REMatchState *x, size_t parenIndex)
2068 const WCHAR *parenContent;
2069 RECapture *cap = &x->parens[parenIndex];
2071 if (cap->index == -1)
2075 if (x->cp + len > gData->cpend)
2078 parenContent = &gData->cpbegin[cap->index];
2079 if (gData->regexp->flags & JSREG_FOLD) {
2080 for (i = 0; i < len; i++) {
2081 if (toupperW(parenContent[i]) != toupperW(x->cp[i]))
2085 for (i = 0; i < len; i++) {
2086 if (parenContent[i] != x->cp[i])
2094 /* Add a single character to the RECharSet */
2096 AddCharacterToCharSet(RECharSet *cs, WCHAR c)
2098 UINT byteIndex = (UINT)(c >> 3);
2099 assert(c <= cs->length);
2100 cs->u.bits[byteIndex] |= 1 << (c & 0x7);
2104 /* Add a character range, c1 to c2 (inclusive) to the RECharSet */
2106 AddCharacterRangeToCharSet(RECharSet *cs, UINT c1, UINT c2)
2110 UINT byteIndex1 = c1 >> 3;
2111 UINT byteIndex2 = c2 >> 3;
2113 assert(c2 <= cs->length && c1 <= c2);
2118 if (byteIndex1 == byteIndex2) {
2119 cs->u.bits[byteIndex1] |= ((BYTE)0xFF >> (7 - (c2 - c1))) << c1;
2121 cs->u.bits[byteIndex1] |= 0xFF << c1;
2122 for (i = byteIndex1 + 1; i < byteIndex2; i++)
2123 cs->u.bits[i] = 0xFF;
2124 cs->u.bits[byteIndex2] |= (BYTE)0xFF >> (7 - c2);
2128 /* Compile the source of the class into a RECharSet */
2130 ProcessCharSet(REGlobalData *gData, RECharSet *charSet)
2132 const WCHAR *src, *end;
2133 BOOL inRange = FALSE;
2134 WCHAR rangeStart = 0;
2139 assert(!charSet->converted);
2141 * Assert that startIndex and length points to chars inside [] inside
2144 assert(1 <= charSet->u.src.startIndex);
2145 assert(charSet->u.src.startIndex
2146 < SysStringLen(gData->regexp->source));
2147 assert(charSet->u.src.length <= SysStringLen(gData->regexp->source)
2148 - 1 - charSet->u.src.startIndex);
2150 charSet->converted = TRUE;
2151 src = gData->regexp->source + charSet->u.src.startIndex;
2153 end = src + charSet->u.src.length;
2155 assert(src[-1] == '[' && end[0] == ']');
2157 byteLength = (charSet->length >> 3) + 1;
2158 charSet->u.bits = heap_alloc(byteLength);
2159 if (!charSet->u.bits) {
2160 JS_ReportOutOfMemory(gData->cx);
2164 memset(charSet->u.bits, 0, byteLength);
2170 assert(charSet->sense == FALSE);
2173 assert(charSet->sense == TRUE);
2176 while (src != end) {
2201 if (src < end && JS_ISWORD(*src)) {
2202 thisCh = (WCHAR)(*src++ & 0x1F);
2215 for (i = 0; (i < nDigits) && (src < end); i++) {
2218 if (!isASCIIHexDigit(c, &digit)) {
2220 * Back off to accepting the original '\'
2227 n = (n << 4) | digit;
2240 * This is a non-ECMA extension - decimal escapes (in this
2241 * case, octal!) are supposed to be an error inside class
2242 * ranges, but supported here for backwards compatibility.
2246 if ('0' <= c && c <= '7') {
2248 n = 8 * n + JS7_UNDEC(c);
2250 if ('0' <= c && c <= '7') {
2252 i = 8 * n + JS7_UNDEC(c);
2263 AddCharacterRangeToCharSet(charSet, '0', '9');
2264 continue; /* don't need range processing */
2266 AddCharacterRangeToCharSet(charSet, 0, '0' - 1);
2267 AddCharacterRangeToCharSet(charSet,
2269 (WCHAR)charSet->length);
2272 for (i = (INT)charSet->length; i >= 0; i--)
2274 AddCharacterToCharSet(charSet, (WCHAR)i);
2277 for (i = (INT)charSet->length; i >= 0; i--)
2279 AddCharacterToCharSet(charSet, (WCHAR)i);
2282 for (i = (INT)charSet->length; i >= 0; i--)
2284 AddCharacterToCharSet(charSet, (WCHAR)i);
2287 for (i = (INT)charSet->length; i >= 0; i--)
2289 AddCharacterToCharSet(charSet, (WCHAR)i);
2304 if (gData->regexp->flags & JSREG_FOLD) {
2307 assert(rangeStart <= thisCh);
2308 for (i = rangeStart; i <= thisCh; i++) {
2311 AddCharacterToCharSet(charSet, i);
2315 AddCharacterToCharSet(charSet, uch);
2317 AddCharacterToCharSet(charSet, dch);
2320 AddCharacterRangeToCharSet(charSet, rangeStart, thisCh);
2324 if (gData->regexp->flags & JSREG_FOLD) {
2325 AddCharacterToCharSet(charSet, toupperW(thisCh));
2326 AddCharacterToCharSet(charSet, tolowerW(thisCh));
2328 AddCharacterToCharSet(charSet, thisCh);
2330 if (src < end - 1) {
2334 rangeStart = thisCh;
2343 ReallocStateStack(REGlobalData *gData)
2345 size_t limit = gData->stateStackLimit;
2346 size_t sz = sizeof(REProgState) * limit;
2348 gData->stateStack = jsheap_grow(gData->pool, gData->stateStack, sz, sz);
2349 if (!gData->stateStack) {
2350 js_ReportOutOfScriptQuota(gData->cx);
2354 gData->stateStackLimit = limit + limit;
2358 #define PUSH_STATE_STACK(data) \
2360 ++(data)->stateStackTop; \
2361 if ((data)->stateStackTop == (data)->stateStackLimit && \
2362 !ReallocStateStack((data))) { \
2368 * Apply the current op against the given input to see if it's going to match
2369 * or fail. Return false if we don't get a match, true if we do. If updatecp is
2370 * true, then update the current state's cp. Always update startpc to the next
2373 static inline REMatchState *
2374 SimpleMatch(REGlobalData *gData, REMatchState *x, REOp op,
2375 jsbytecode **startpc, BOOL updatecp)
2377 REMatchState *result = NULL;
2380 size_t offset, length, index;
2381 jsbytecode *pc = *startpc; /* pc has already been incremented past op */
2383 const WCHAR *startcp = x->cp;
2387 const char *opname = reop_names[op];
2388 TRACE("\n%06d: %*s%s\n", pc - gData->regexp->program,
2389 gData->stateStackTop * 2, "", opname);
2396 if (x->cp != gData->cpbegin) {
2397 if (/*!gData->cx->regExpStatics.multiline && FIXME !!! */
2398 !(gData->regexp->flags & JSREG_MULTILINE)) {
2401 if (!RE_IS_LINE_TERM(x->cp[-1]))
2407 if (x->cp != gData->cpend) {
2408 if (/*!gData->cx->regExpStatics.multiline &&*/
2409 !(gData->regexp->flags & JSREG_MULTILINE)) {
2412 if (!RE_IS_LINE_TERM(*x->cp))
2418 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2419 !(x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2424 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2425 (x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2430 if (x->cp != gData->cpend && !RE_IS_LINE_TERM(*x->cp)) {
2436 if (x->cp != gData->cpend && JS7_ISDEC(*x->cp)) {
2442 if (x->cp != gData->cpend && !JS7_ISDEC(*x->cp)) {
2448 if (x->cp != gData->cpend && JS_ISWORD(*x->cp)) {
2454 if (x->cp != gData->cpend && !JS_ISWORD(*x->cp)) {
2460 if (x->cp != gData->cpend && isspaceW(*x->cp)) {
2466 if (x->cp != gData->cpend && !isspaceW(*x->cp)) {
2472 pc = ReadCompactIndex(pc, &parenIndex);
2473 assert(parenIndex < gData->regexp->parenCount);
2474 result = BackrefMatcher(gData, x, parenIndex);
2477 pc = ReadCompactIndex(pc, &offset);
2478 assert(offset < SysStringLen(gData->regexp->source));
2479 pc = ReadCompactIndex(pc, &length);
2480 assert(1 <= length);
2481 assert(length <= SysStringLen(gData->regexp->source) - offset);
2482 if (length <= (size_t)(gData->cpend - x->cp)) {
2483 source = gData->regexp->source + offset;
2484 TRACE("%s\n", debugstr_wn(source, length));
2485 for (index = 0; index != length; index++) {
2486 if (source[index] != x->cp[index])
2495 TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2496 if (x->cp != gData->cpend && *x->cp == matchCh) {
2502 pc = ReadCompactIndex(pc, &offset);
2503 assert(offset < SysStringLen(gData->regexp->source));
2504 pc = ReadCompactIndex(pc, &length);
2505 assert(1 <= length);
2506 assert(length <= SysStringLen(gData->regexp->source) - offset);
2507 source = gData->regexp->source;
2508 result = FlatNIMatcher(gData, x, source + offset, length);
2512 if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
2518 matchCh = GET_ARG(pc);
2519 TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2521 if (x->cp != gData->cpend && *x->cp == matchCh) {
2527 matchCh = GET_ARG(pc);
2529 if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
2535 pc = ReadCompactIndex(pc, &index);
2536 assert(index < gData->regexp->classCount);
2537 if (x->cp != gData->cpend) {
2538 charSet = &gData->regexp->classList[index];
2539 assert(charSet->converted);
2542 if (charSet->length != 0 &&
2543 ch <= charSet->length &&
2544 (charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2551 pc = ReadCompactIndex(pc, &index);
2552 assert(index < gData->regexp->classCount);
2553 if (x->cp != gData->cpend) {
2554 charSet = &gData->regexp->classList[index];
2555 assert(charSet->converted);
2558 if (charSet->length == 0 ||
2559 ch > charSet->length ||
2560 !(charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2581 static inline REMatchState *
2582 ExecuteREBytecode(REGlobalData *gData, REMatchState *x)
2584 REMatchState *result = NULL;
2585 REBackTrackData *backTrackData;
2586 jsbytecode *nextpc, *testpc;
2589 REProgState *curState;
2590 const WCHAR *startcp;
2591 size_t parenIndex, k;
2592 size_t parenSoFar = 0;
2594 WCHAR matchCh1, matchCh2;
2598 jsbytecode *pc = gData->regexp->program;
2599 REOp op = (REOp) *pc++;
2602 * If the first node is a simple match, step the index into the string
2603 * until that match is made, or fail if it can't be found at all.
2605 if (REOP_IS_SIMPLE(op) && !(gData->regexp->flags & JSREG_STICKY)) {
2607 while (x->cp <= gData->cpend) {
2608 nextpc = pc; /* reset back to start each time */
2609 result = SimpleMatch(gData, x, op, &nextpc, TRUE);
2613 pc = nextpc; /* accept skip to next opcode */
2615 assert(op < REOP_LIMIT);
2626 const char *opname = reop_names[op];
2627 TRACE("\n%06d: %*s%s\n", pc - gData->regexp->program,
2628 gData->stateStackTop * 2, "", opname);
2630 if (REOP_IS_SIMPLE(op)) {
2631 result = SimpleMatch(gData, x, op, &pc, TRUE);
2633 curState = &gData->stateStack[gData->stateStackTop];
2637 case REOP_ALTPREREQ2:
2638 nextpc = pc + GET_OFFSET(pc); /* start of next op */
2640 matchCh2 = GET_ARG(pc);
2645 if (x->cp != gData->cpend) {
2646 if (*x->cp == matchCh2)
2649 charSet = &gData->regexp->classList[k];
2650 if (!charSet->converted && !ProcessCharSet(gData, charSet))
2654 if ((charSet->length == 0 ||
2655 matchCh1 > charSet->length ||
2656 !(charSet->u.bits[k] & (1 << (matchCh1 & 0x7)))) ^
2664 case REOP_ALTPREREQ:
2665 nextpc = pc + GET_OFFSET(pc); /* start of next op */
2667 matchCh1 = GET_ARG(pc);
2669 matchCh2 = GET_ARG(pc);
2671 if (x->cp == gData->cpend ||
2672 (*x->cp != matchCh1 && *x->cp != matchCh2)) {
2676 /* else false thru... */
2680 nextpc = pc + GET_OFFSET(pc); /* start of next alternate */
2681 pc += ARG_LEN; /* start of this alternate */
2682 curState->parenSoFar = parenSoFar;
2683 PUSH_STATE_STACK(gData);
2686 if (REOP_IS_SIMPLE(op)) {
2687 if (!SimpleMatch(gData, x, op, &pc, TRUE)) {
2688 op = (REOp) *nextpc++;
2695 nextop = (REOp) *nextpc++;
2696 if (!PushBackTrackState(gData, nextop, nextpc, x, startcp, 0, 0))
2701 * Occurs at (successful) end of REOP_ALT,
2705 * If we have not gotten a result here, it is because of an
2706 * empty match. Do the same thing REOP_EMPTY would do.
2711 --gData->stateStackTop;
2712 pc += GET_OFFSET(pc);
2717 * Occurs at last (successful) end of REOP_ALT,
2721 * If we have not gotten a result here, it is because of an
2722 * empty match. Do the same thing REOP_EMPTY would do.
2727 --gData->stateStackTop;
2732 pc = ReadCompactIndex(pc, &parenIndex);
2733 TRACE("[ %lu ]\n", (unsigned long) parenIndex);
2734 assert(parenIndex < gData->regexp->parenCount);
2735 if (parenIndex + 1 > parenSoFar)
2736 parenSoFar = parenIndex + 1;
2737 x->parens[parenIndex].index = x->cp - gData->cpbegin;
2738 x->parens[parenIndex].length = 0;
2746 pc = ReadCompactIndex(pc, &parenIndex);
2747 assert(parenIndex < gData->regexp->parenCount);
2748 cap = &x->parens[parenIndex];
2749 delta = x->cp - (gData->cpbegin + cap->index);
2750 cap->length = (delta < 0) ? 0 : (size_t) delta;
2758 nextpc = pc + GET_OFFSET(pc); /* start of term after ASSERT */
2759 pc += ARG_LEN; /* start of ASSERT child */
2762 if (REOP_IS_SIMPLE(op) &&
2763 !SimpleMatch(gData, x, op, &testpc, FALSE)) {
2767 curState->u.assertion.top =
2768 (char *)gData->backTrackSP - (char *)gData->backTrackStack;
2769 curState->u.assertion.sz = gData->cursz;
2770 curState->index = x->cp - gData->cpbegin;
2771 curState->parenSoFar = parenSoFar;
2772 PUSH_STATE_STACK(gData);
2773 if (!PushBackTrackState(gData, REOP_ASSERTTEST,
2774 nextpc, x, x->cp, 0, 0)) {
2779 case REOP_ASSERT_NOT:
2780 nextpc = pc + GET_OFFSET(pc);
2784 if (REOP_IS_SIMPLE(op) /* Note - fail to fail! */ &&
2785 SimpleMatch(gData, x, op, &testpc, FALSE) &&
2786 *testpc == REOP_ASSERTNOTTEST) {
2790 curState->u.assertion.top
2791 = (char *)gData->backTrackSP -
2792 (char *)gData->backTrackStack;
2793 curState->u.assertion.sz = gData->cursz;
2794 curState->index = x->cp - gData->cpbegin;
2795 curState->parenSoFar = parenSoFar;
2796 PUSH_STATE_STACK(gData);
2797 if (!PushBackTrackState(gData, REOP_ASSERTNOTTEST,
2798 nextpc, x, x->cp, 0, 0)) {
2803 case REOP_ASSERTTEST:
2804 --gData->stateStackTop;
2806 x->cp = gData->cpbegin + curState->index;
2807 gData->backTrackSP =
2808 (REBackTrackData *) ((char *)gData->backTrackStack +
2809 curState->u.assertion.top);
2810 gData->cursz = curState->u.assertion.sz;
2815 case REOP_ASSERTNOTTEST:
2816 --gData->stateStackTop;
2818 x->cp = gData->cpbegin + curState->index;
2819 gData->backTrackSP =
2820 (REBackTrackData *) ((char *)gData->backTrackStack +
2821 curState->u.assertion.top);
2822 gData->cursz = curState->u.assertion.sz;
2823 result = (!result) ? x : NULL;
2826 curState->u.quantifier.min = 0;
2827 curState->u.quantifier.max = (UINT)-1;
2830 curState->u.quantifier.min = 1;
2831 curState->u.quantifier.max = (UINT)-1;
2834 curState->u.quantifier.min = 0;
2835 curState->u.quantifier.max = 1;
2838 pc = ReadCompactIndex(pc, &k);
2839 curState->u.quantifier.min = k;
2840 pc = ReadCompactIndex(pc, &k);
2841 /* max is k - 1 to use one byte for (UINT)-1 sentinel. */
2842 curState->u.quantifier.max = k - 1;
2843 assert(curState->u.quantifier.min <= curState->u.quantifier.max);
2845 if (curState->u.quantifier.max == 0) {
2846 pc = pc + GET_OFFSET(pc);
2851 /* Step over <next> */
2852 nextpc = pc + ARG_LEN;
2853 op = (REOp) *nextpc++;
2855 if (REOP_IS_SIMPLE(op)) {
2856 if (!SimpleMatch(gData, x, op, &nextpc, TRUE)) {
2857 if (curState->u.quantifier.min == 0)
2861 pc = pc + GET_OFFSET(pc);
2864 op = (REOp) *nextpc++;
2867 curState->index = startcp - gData->cpbegin;
2868 curState->continue_op = REOP_REPEAT;
2869 curState->continue_pc = pc;
2870 curState->parenSoFar = parenSoFar;
2871 PUSH_STATE_STACK(gData);
2872 if (curState->u.quantifier.min == 0 &&
2873 !PushBackTrackState(gData, REOP_REPEAT, pc, x, startcp,
2880 case REOP_ENDCHILD: /* marks the end of a quantifier child */
2881 pc = curState[-1].continue_pc;
2882 op = (REOp) curState[-1].continue_op;
2891 --gData->stateStackTop;
2893 /* Failed, see if we have enough children. */
2894 if (curState->u.quantifier.min == 0)
2898 if (curState->u.quantifier.min == 0 &&
2899 x->cp == gData->cpbegin + curState->index) {
2900 /* matched an empty string, that'll get us nowhere */
2904 if (curState->u.quantifier.min != 0)
2905 curState->u.quantifier.min--;
2906 if (curState->u.quantifier.max != (UINT) -1)
2907 curState->u.quantifier.max--;
2908 if (curState->u.quantifier.max == 0)
2910 nextpc = pc + ARG_LEN;
2911 nextop = (REOp) *nextpc;
2913 if (REOP_IS_SIMPLE(nextop)) {
2915 if (!SimpleMatch(gData, x, nextop, &nextpc, TRUE)) {
2916 if (curState->u.quantifier.min == 0)
2923 curState->index = startcp - gData->cpbegin;
2924 PUSH_STATE_STACK(gData);
2925 if (curState->u.quantifier.min == 0 &&
2926 !PushBackTrackState(gData, REOP_REPEAT,
2928 curState->parenSoFar,
2930 curState->parenSoFar)) {
2933 } while (*nextpc == REOP_ENDCHILD);
2936 parenSoFar = curState->parenSoFar;
2941 pc += GET_OFFSET(pc);
2944 case REOP_MINIMALSTAR:
2945 curState->u.quantifier.min = 0;
2946 curState->u.quantifier.max = (UINT)-1;
2947 goto minimalquantcommon;
2948 case REOP_MINIMALPLUS:
2949 curState->u.quantifier.min = 1;
2950 curState->u.quantifier.max = (UINT)-1;
2951 goto minimalquantcommon;
2952 case REOP_MINIMALOPT:
2953 curState->u.quantifier.min = 0;
2954 curState->u.quantifier.max = 1;
2955 goto minimalquantcommon;
2956 case REOP_MINIMALQUANT:
2957 pc = ReadCompactIndex(pc, &k);
2958 curState->u.quantifier.min = k;
2959 pc = ReadCompactIndex(pc, &k);
2960 /* See REOP_QUANT comments about k - 1. */
2961 curState->u.quantifier.max = k - 1;
2962 assert(curState->u.quantifier.min
2963 <= curState->u.quantifier.max);
2965 curState->index = x->cp - gData->cpbegin;
2966 curState->parenSoFar = parenSoFar;
2967 PUSH_STATE_STACK(gData);
2968 if (curState->u.quantifier.min != 0) {
2969 curState->continue_op = REOP_MINIMALREPEAT;
2970 curState->continue_pc = pc;
2971 /* step over <next> */
2975 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
2976 pc, x, x->cp, 0, 0)) {
2979 --gData->stateStackTop;
2980 pc = pc + GET_OFFSET(pc);
2985 case REOP_MINIMALREPEAT:
2986 --gData->stateStackTop;
2989 TRACE("{%d,%d}\n", curState->u.quantifier.min, curState->u.quantifier.max);
2990 #define PREPARE_REPEAT() \
2992 curState->index = x->cp - gData->cpbegin; \
2993 curState->continue_op = REOP_MINIMALREPEAT; \
2994 curState->continue_pc = pc; \
2996 for (k = curState->parenSoFar; k < parenSoFar; k++) \
2997 x->parens[k].index = -1; \
2998 PUSH_STATE_STACK(gData); \
2999 op = (REOp) *pc++; \
3000 assert(op < REOP_LIMIT); \
3006 * Non-greedy failure - try to consume another child.
3008 if (curState->u.quantifier.max == (UINT) -1 ||
3009 curState->u.quantifier.max > 0) {
3013 /* Don't need to adjust pc since we're going to pop. */
3016 if (curState->u.quantifier.min == 0 &&
3017 x->cp == gData->cpbegin + curState->index) {
3018 /* Matched an empty string, that'll get us nowhere. */
3022 if (curState->u.quantifier.min != 0)
3023 curState->u.quantifier.min--;
3024 if (curState->u.quantifier.max != (UINT) -1)
3025 curState->u.quantifier.max--;
3026 if (curState->u.quantifier.min != 0) {
3030 curState->index = x->cp - gData->cpbegin;
3031 curState->parenSoFar = parenSoFar;
3032 PUSH_STATE_STACK(gData);
3033 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
3035 curState->parenSoFar,
3036 parenSoFar - curState->parenSoFar)) {
3039 --gData->stateStackTop;
3040 pc = pc + GET_OFFSET(pc);
3042 assert(op < REOP_LIMIT);
3052 * If the match failed and there's a backtrack option, take it.
3053 * Otherwise this is a complete and utter failure.
3056 if (gData->cursz == 0)
3059 /* Potentially detect explosive regex here. */
3060 gData->backTrackCount++;
3061 if (gData->backTrackLimit &&
3062 gData->backTrackCount >= gData->backTrackLimit) {
3063 JS_ReportErrorNumber(gData->cx, js_GetErrorMessage, NULL,
3064 JSMSG_REGEXP_TOO_COMPLEX);
3069 backTrackData = gData->backTrackSP;
3070 gData->cursz = backTrackData->sz;
3071 gData->backTrackSP =
3072 (REBackTrackData *) ((char *)backTrackData - backTrackData->sz);
3073 x->cp = backTrackData->cp;
3074 pc = backTrackData->backtrack_pc;
3075 op = (REOp) backTrackData->backtrack_op;
3076 assert(op < REOP_LIMIT);
3077 gData->stateStackTop = backTrackData->saveStateStackTop;
3078 assert(gData->stateStackTop);
3080 memcpy(gData->stateStack, backTrackData + 1,
3081 sizeof(REProgState) * backTrackData->saveStateStackTop);
3082 curState = &gData->stateStack[gData->stateStackTop - 1];
3084 if (backTrackData->parenCount) {
3085 memcpy(&x->parens[backTrackData->parenIndex],
3086 (char *)(backTrackData + 1) +
3087 sizeof(REProgState) * backTrackData->saveStateStackTop,
3088 sizeof(RECapture) * backTrackData->parenCount);
3089 parenSoFar = backTrackData->parenIndex + backTrackData->parenCount;
3091 for (k = curState->parenSoFar; k < parenSoFar; k++)
3092 x->parens[k].index = -1;
3093 parenSoFar = curState->parenSoFar;
3096 TRACE("\tBT_Pop: %ld,%ld\n",
3097 (unsigned long) backTrackData->parenIndex,
3098 (unsigned long) backTrackData->parenCount);
3104 * Continue with the expression.
3107 assert(op < REOP_LIMIT);
3119 static REMatchState *MatchRegExp(REGlobalData *gData, REMatchState *x)
3121 REMatchState *result;
3122 const WCHAR *cp = x->cp;
3127 * Have to include the position beyond the last character
3128 * in order to detect end-of-input/line condition.
3130 for (cp2 = cp; cp2 <= gData->cpend; cp2++) {
3131 gData->skipped = cp2 - cp;
3133 for (j = 0; j < gData->regexp->parenCount; j++)
3134 x->parens[j].index = -1;
3135 result = ExecuteREBytecode(gData, x);
3136 if (!gData->ok || result || (gData->regexp->flags & JSREG_STICKY))
3138 gData->backTrackSP = gData->backTrackStack;
3140 gData->stateStackTop = 0;
3141 cp2 = cp + gData->skipped;
3146 #define MIN_BACKTRACK_LIMIT 400000
3148 static REMatchState *InitMatch(script_ctx_t *cx, REGlobalData *gData, JSRegExp *re, size_t length)
3150 REMatchState *result;
3153 gData->backTrackStackSize = INITIAL_BACKTRACK;
3154 gData->backTrackStack = jsheap_alloc(gData->pool, INITIAL_BACKTRACK);
3155 if (!gData->backTrackStack)
3158 gData->backTrackSP = gData->backTrackStack;
3160 gData->backTrackCount = 0;
3161 gData->backTrackLimit = 0;
3163 gData->stateStackLimit = INITIAL_STATESTACK;
3164 gData->stateStack = jsheap_alloc(gData->pool, sizeof(REProgState) * INITIAL_STATESTACK);
3165 if (!gData->stateStack)
3168 gData->stateStackTop = 0;
3173 result = jsheap_alloc(gData->pool, offsetof(REMatchState, parens) + re->parenCount * sizeof(RECapture));
3177 for (i = 0; i < re->classCount; i++) {
3178 if (!re->classList[i].converted &&
3179 !ProcessCharSet(gData, &re->classList[i])) {
3187 js_ReportOutOfScriptQuota(cx);
3193 js_DestroyRegExp(JSRegExp *re)
3195 if (re->classList) {
3197 for (i = 0; i < re->classCount; i++) {
3198 if (re->classList[i].converted)
3199 heap_free(re->classList[i].u.bits);
3200 re->classList[i].u.bits = NULL;
3202 heap_free(re->classList);
3208 js_NewRegExp(script_ctx_t *cx, BSTR str, UINT flags, BOOL flat)
3212 CompilerState state;
3219 mark = jsheap_mark(&cx->tmp_heap);
3220 len = SysStringLen(str);
3226 state.cpbegin = state.cp;
3227 state.cpend = state.cp + len;
3228 state.flags = flags;
3229 state.parenCount = 0;
3230 state.classCount = 0;
3231 state.progLength = 0;
3232 state.treeDepth = 0;
3233 state.classBitmapsMem = 0;
3234 for (i = 0; i < CLASS_CACHE_SIZE; i++)
3235 state.classCache[i].start = NULL;
3237 if (len != 0 && flat) {
3238 state.result = NewRENode(&state, REOP_FLAT);
3241 state.result->u.flat.chr = *state.cpbegin;
3242 state.result->u.flat.length = len;
3243 state.result->kid = (void *) state.cpbegin;
3244 /* Flat bytecode: REOP_FLAT compact(string_offset) compact(len). */
3245 state.progLength += 1 + GetCompactIndexWidth(0)
3246 + GetCompactIndexWidth(len);
3248 if (!ParseRegExp(&state))
3251 resize = offsetof(JSRegExp, program) + state.progLength + 1;
3252 re = heap_alloc(resize);
3256 assert(state.classBitmapsMem <= CLASS_BITMAPS_MEM_LIMIT);
3257 re->classCount = state.classCount;
3258 if (re->classCount) {
3259 re->classList = heap_alloc(re->classCount * sizeof(RECharSet));
3260 if (!re->classList) {
3261 js_DestroyRegExp(re);
3265 for (i = 0; i < re->classCount; i++)
3266 re->classList[i].converted = FALSE;
3268 re->classList = NULL;
3270 endPC = EmitREBytecode(&state, re, state.treeDepth, re->program, state.result);
3272 js_DestroyRegExp(re);
3276 *endPC++ = REOP_END;
3278 * Check whether size was overestimated and shrink using realloc.
3279 * This is safe since no pointers to newly parsed regexp or its parts
3280 * besides re exist here.
3282 if ((size_t)(endPC - re->program) != state.progLength + 1) {
3284 assert((size_t)(endPC - re->program) < state.progLength + 1);
3285 resize = offsetof(JSRegExp, program) + (endPC - re->program);
3286 tmp = heap_realloc(re, resize);
3292 re->parenCount = state.parenCount;
3300 static HRESULT do_regexp_match_next(RegExpInstance *regexp, const WCHAR *str, DWORD len,
3301 const WCHAR **cp, match_result_t **parens, DWORD *parens_size, DWORD *parens_cnt, match_result_t *ret)
3303 REMatchState *x, *result;
3307 gData.cpbegin = *cp;
3308 gData.cpend = str + len;
3309 gData.start = *cp-str;
3311 gData.pool = ®exp->dispex.ctx->tmp_heap;
3313 x = InitMatch(NULL, &gData, regexp->jsregexp, gData.cpend - gData.cpbegin);
3315 WARN("InitMatch failed\n");
3320 result = MatchRegExp(&gData, x);
3322 WARN("MatchRegExp failed\n");
3332 if(regexp->jsregexp->parenCount > *parens_size) {
3333 match_result_t *new_parens;
3336 new_parens = heap_realloc(*parens, sizeof(match_result_t)*regexp->jsregexp->parenCount);
3338 new_parens = heap_alloc(sizeof(match_result_t)*regexp->jsregexp->parenCount);
3340 return E_OUTOFMEMORY;
3342 *parens = new_parens;
3345 *parens_cnt = regexp->jsregexp->parenCount;
3347 for(i=0; i < regexp->jsregexp->parenCount; i++) {
3348 (*parens)[i].str = *cp + result->parens[i].index;
3349 (*parens)[i].len = result->parens[i].length;
3353 matchlen = (result->cp-*cp) - gData.skipped;
3355 ret->str = result->cp-matchlen;
3356 ret->len = matchlen;
3361 HRESULT regexp_match_next(DispatchEx *dispex, BOOL gcheck, const WCHAR *str, DWORD len,
3362 const WCHAR **cp, match_result_t **parens, DWORD *parens_size, DWORD *parens_cnt, match_result_t *ret)
3364 RegExpInstance *regexp = (RegExpInstance*)dispex;
3368 if(gcheck && !(regexp->jsregexp->flags & JSREG_GLOB))
3371 mark = jsheap_mark(®exp->dispex.ctx->tmp_heap);
3373 hres = do_regexp_match_next(regexp, str, len, cp, parens, parens_size, parens_cnt, ret);
3379 HRESULT regexp_match(DispatchEx *dispex, const WCHAR *str, DWORD len, BOOL gflag, match_result_t **match_result,
3382 RegExpInstance *This = (RegExpInstance*)dispex;
3383 match_result_t *ret = NULL, cres;
3384 const WCHAR *cp = str;
3385 DWORD i=0, ret_size = 0;
3389 mark = jsheap_mark(&This->dispex.ctx->tmp_heap);
3392 hres = do_regexp_match_next(This, str, len, &cp, NULL, NULL, NULL, &cres);
3393 if(hres == S_FALSE) {
3403 ret = heap_realloc(ret, (ret_size <<= 1) * sizeof(match_result_t));
3405 ret = heap_alloc((ret_size=4) * sizeof(match_result_t));
3407 hres = E_OUTOFMEMORY;
3414 if(!gflag && !(This->jsregexp->flags & JSREG_GLOB)) {
3426 *match_result = ret;
3431 static HRESULT RegExp_source(DispatchEx *dispex, LCID lcid, WORD flags, DISPPARAMS *dp,
3432 VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
3438 static HRESULT RegExp_global(DispatchEx *dispex, LCID lcid, WORD flags, DISPPARAMS *dp,
3439 VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
3445 static HRESULT RegExp_ignoreCase(DispatchEx *dispex, LCID lcid, WORD flags, DISPPARAMS *dp,
3446 VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
3452 static HRESULT RegExp_multiline(DispatchEx *dispex, LCID lcid, WORD flags, DISPPARAMS *dp,
3453 VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
3459 static HRESULT RegExp_lastIndex(DispatchEx *dispex, LCID lcid, WORD flags, DISPPARAMS *dp,
3460 VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
3466 static HRESULT RegExp_toString(DispatchEx *dispex, LCID lcid, WORD flags, DISPPARAMS *dp,
3467 VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
3473 static HRESULT RegExp_toLocaleString(DispatchEx *dispex, LCID lcid, WORD flags, DISPPARAMS *dp,
3474 VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
3480 static HRESULT RegExp_hasOwnProperty(DispatchEx *dispex, LCID lcid, WORD flags, DISPPARAMS *dp,
3481 VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
3487 static HRESULT RegExp_propertyIsEnumerable(DispatchEx *dispex, LCID lcid, WORD flags, DISPPARAMS *dp,
3488 VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
3494 static HRESULT RegExp_isPrototypeOf(DispatchEx *dispex, LCID lcid, WORD flags, DISPPARAMS *dp,
3495 VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
3501 static HRESULT RegExp_exec(DispatchEx *dispex, LCID lcid, WORD flags, DISPPARAMS *dp,
3502 VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
3508 static HRESULT RegExp_value(DispatchEx *dispex, LCID lcid, WORD flags, DISPPARAMS *dp,
3509 VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
3515 static void RegExp_destructor(DispatchEx *dispex)
3517 RegExpInstance *This = (RegExpInstance*)dispex;
3520 js_DestroyRegExp(This->jsregexp);
3521 SysFreeString(This->str);
3525 static const builtin_prop_t RegExp_props[] = {
3526 {execW, RegExp_exec, PROPF_METHOD},
3527 {globalW, RegExp_global, 0},
3528 {hasOwnPropertyW, RegExp_hasOwnProperty, PROPF_METHOD},
3529 {ignoreCaseW, RegExp_ignoreCase, 0},
3530 {isPrototypeOfW, RegExp_isPrototypeOf, PROPF_METHOD},
3531 {lastIndexW, RegExp_lastIndex, 0},
3532 {multilineW, RegExp_multiline, 0},
3533 {propertyIsEnumerableW, RegExp_propertyIsEnumerable, PROPF_METHOD},
3534 {sourceW, RegExp_source, 0},
3535 {toLocaleStringW, RegExp_toLocaleString, PROPF_METHOD},
3536 {toStringW, RegExp_toString, PROPF_METHOD}
3539 static const builtin_info_t RegExp_info = {
3541 {NULL, RegExp_value, 0},
3542 sizeof(RegExp_props)/sizeof(*RegExp_props),
3548 static HRESULT alloc_regexp(script_ctx_t *ctx, BOOL use_constr, RegExpInstance **ret)
3550 RegExpInstance *regexp;
3553 regexp = heap_alloc_zero(sizeof(RegExpInstance));
3555 return E_OUTOFMEMORY;
3558 hres = init_dispex_from_constr(®exp->dispex, ctx, &RegExp_info, ctx->regexp_constr);
3560 hres = init_dispex(®exp->dispex, ctx, &RegExp_info, NULL);
3571 static HRESULT create_regexp(script_ctx_t *ctx, const WCHAR *exp, int len, DWORD flags, DispatchEx **ret)
3573 RegExpInstance *regexp;
3576 TRACE("%s %x\n", debugstr_w(exp), flags);
3578 hres = alloc_regexp(ctx, TRUE, ®exp);
3583 regexp->str = SysAllocString(exp);
3585 regexp->str = SysAllocStringLen(exp, len);
3587 jsdisp_release(®exp->dispex);
3588 return E_OUTOFMEMORY;
3591 regexp->jsregexp = js_NewRegExp(ctx, regexp->str, flags, FALSE);
3592 if(!regexp->jsregexp) {
3593 WARN("js_NewRegExp failed\n");
3594 jsdisp_release(®exp->dispex);
3598 *ret = ®exp->dispex;
3602 static HRESULT regexp_constructor(script_ctx_t *ctx, DISPPARAMS *dp, VARIANT *retv)
3604 const WCHAR *opt = emptyW, *src;
3614 arg = get_arg(dp,0);
3615 if(V_VT(arg) == VT_DISPATCH) {
3618 obj = iface_to_jsdisp((IUnknown*)V_DISPATCH(arg));
3620 if(is_class(obj, JSCLASS_REGEXP)) {
3621 RegExpInstance *regexp = (RegExpInstance*)obj;
3623 hres = create_regexp(ctx, regexp->str, -1, regexp->jsregexp->flags, &ret);
3624 jsdisp_release(obj);
3628 V_VT(retv) = VT_DISPATCH;
3629 V_DISPATCH(retv) = (IDispatch*)_IDispatchEx_(ret);
3633 jsdisp_release(obj);
3637 if(V_VT(arg) != VT_BSTR) {
3638 FIXME("vt arg0 = %d\n", V_VT(arg));
3644 if(arg_cnt(dp) >= 2) {
3645 arg = get_arg(dp,1);
3646 if(V_VT(arg) != VT_BSTR) {
3647 FIXME("unimplemented for vt %d\n", V_VT(arg));
3654 hres = create_regexp_str(ctx, src, -1, opt, strlenW(opt), &ret);
3658 V_VT(retv) = VT_DISPATCH;
3659 V_DISPATCH(retv) = (IDispatch*)_IDispatchEx_(ret);
3663 static HRESULT RegExpConstr_value(DispatchEx *dispex, LCID lcid, WORD flags, DISPPARAMS *dp,
3664 VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
3669 case DISPATCH_CONSTRUCT:
3670 return regexp_constructor(dispex->ctx, dp, retv);
3672 FIXME("unimplemented flags: %x\n", flags);
3679 HRESULT create_regexp_constr(script_ctx_t *ctx, DispatchEx **ret)
3681 RegExpInstance *regexp;
3684 hres = alloc_regexp(ctx, FALSE, ®exp);
3688 hres = create_builtin_function(ctx, RegExpConstr_value, PROPF_CONSTR, ®exp->dispex, ret);
3690 jsdisp_release(®exp->dispex);
3694 HRESULT create_regexp_str(script_ctx_t *ctx, const WCHAR *exp, DWORD exp_len, const WCHAR *opt,
3695 DWORD opt_len, DispatchEx **ret)
3701 for (p = opt; p < opt+opt_len; p++) {
3704 flags |= JSREG_GLOB;
3707 flags |= JSREG_FOLD;
3710 flags |= JSREG_MULTILINE;
3713 flags |= JSREG_STICKY;
3716 WARN("wrong flag %c\n", *p);
3722 return create_regexp(ctx, exp, exp_len, flags, ret);