/* Implementation of backtracking std.regex engine. Contains both compile-time and run-time versions. */ module std.regex.internal.backtracking; package(std.regex): import core.stdc.stdlib, std.range.primitives, std.traits, std.typecons; import std.regex.internal.ir; import core.memory : pureMalloc, pureFree; /+ BacktrackingMatcher implements backtracking scheme of matching regular expressions. +/ @trusted class BacktrackingMatcher(Char, Stream = Input!Char) : Matcher!Char if (is(Char : dchar)) { alias DataIndex = Stream.DataIndex; struct State {//top bit in pc is set if saved along with matches DataIndex index; uint pc, counter, infiniteNesting; } static assert(State.sizeof % size_t.sizeof == 0); enum stateSize = State.sizeof / size_t.sizeof; enum initialStack = 1 << 11; // items in a block of segmented stack alias String = const(Char)[]; alias RegEx = Regex!Char; alias MatchFn = bool function(BacktrackingMatcher) pure; const RegEx re; // regex program MatchFn nativeFn; // native code for that program // Stream state Stream s; DataIndex index; dchar front; bool exhausted; // Backtracking machine state uint pc, counter; DataIndex lastState = 0; // Top of state stack uint infiniteNesting; size_t[] memory; Trace[] merge; static struct Trace { ulong mask; size_t offset; bool mark(size_t idx) { immutable d = idx - offset; if (d < 64) // including overflow { immutable p = mask & (1UL << d); mask |= 1UL << d; return p != 0; } else { offset = idx; mask = 1; return false; } } } //local slice of matches, global for backref Group!DataIndex[] matches, backrefed; size_t _refCount; final: override @property ref size_t refCount() { return _refCount; } override @property ref const(RegEx) pattern(){ return re; } static if (__traits(hasMember,Stream, "search")) { enum kicked = true; } else enum kicked = false; static size_t initialMemory(const ref RegEx re) { return stackSize(re)*size_t.sizeof + re.hotspotTableSize*Trace.sizeof; } static size_t stackSize(const ref RegEx re) { size_t itemSize = stateSize + re.ngroup * (Group!DataIndex).sizeof / size_t.sizeof; return initialStack * itemSize + 2; } @property bool atStart(){ return index == 0; } @property bool atEnd(){ return index == s.lastIndex && s.atEnd; } void next() { if (!s.nextChar(front, index)) index = s.lastIndex; } void search() { static if (kicked) { if (!s.search(re.kickstart, front, index)) { index = s.lastIndex; } } else next(); } // void newStack() { auto chunk = mallocArray!(size_t)(stackSize(re)); chunk[0] = cast(size_t)(memory.ptr); chunk[1] = lastState; memory = chunk[2..$]; lastState = 0; } bool prevStack() { // pointer to previous block size_t* prev = cast(size_t*) memory.ptr[-2]; if (!prev) { // The last segment is freed in RegexMatch return false; } else { import core.memory : pureFree; // memory used in previous block size_t size = memory.ptr[-1]; pureFree(memory.ptr-2); memory = prev[0 .. size]; lastState = size; return true; } } void initExternalMemory(void[] memBlock) { merge = arrayInChunk!(Trace)(re.hotspotTableSize, memBlock); merge[] = Trace.init; memory = cast(size_t[]) memBlock; memory[0] = 0; // hidden pointer memory[1] = 0; // used size memory = memory[2..$]; } void initialize(ref const RegEx program, Stream stream, void[] memBlock) { s = stream; exhausted = false; initExternalMemory(memBlock); backrefed = null; } override void dupTo(Matcher!Char m, void[] memBlock) { auto backtracking = cast(BacktrackingMatcher) m; backtracking.s = s; backtracking.front = front; backtracking.index = index; backtracking.exhausted = exhausted; backtracking.initExternalMemory(memBlock); } override Matcher!Char rearm(in Char[] data) { merge[] = Trace.init; exhausted = false; s = Stream(data); next(); return this; } this(ref const RegEx program, Stream stream, void[] memBlock, dchar ch, DataIndex idx) { _refCount = 1; re = program; nativeFn = null; initialize(program, stream, memBlock); front = ch; index = idx; } this(ref const RegEx program, MatchFn func, Stream stream, void[] memBlock) { _refCount = 1; re = program; initialize(program, stream, memBlock); nativeFn = func; next(); } this(ref const RegEx program, Stream stream, void[] memBlock) { _refCount = 1; re = program; nativeFn = null; initialize(program, stream, memBlock); next(); } auto fwdMatcher(ref const RegEx re, void[] memBlock) { alias BackMatcher = BacktrackingMatcher!(Char, Stream); auto fwdMatcher = new BackMatcher(re, s, memBlock, front, index); return fwdMatcher; } auto bwdMatcher(ref const RegEx re, void[] memBlock) { alias BackMatcher = BacktrackingMatcher!(Char, typeof(s.loopBack(index))); auto fwdMatcher = new BackMatcher(re, s.loopBack(index), memBlock); return fwdMatcher; } // int matchFinalize() { immutable start = index; immutable val = matchImpl(); if (val) {//stream is updated here matches[0].begin = start; matches[0].end = index; if (!(re.flags & RegexOption.global) || atEnd) exhausted = true; if (start == index)//empty match advances input next(); return val; } else return 0; } //lookup next match, fill matches with indices into input override int match(Group!DataIndex[] matches) { debug(std_regex_matcher) { writeln("------------------------------------------"); } if (exhausted) //all matches collected return false; this.matches = matches; if (re.flags & RegexInfo.oneShot) { exhausted = true; const DataIndex start = index; immutable m = matchImpl(); if (m) { matches[0].begin = start; matches[0].end = index; } return m; } static if (kicked) { if (!re.kickstart.empty) { for (;;) { immutable val = matchFinalize(); if (val) return val; else { if (atEnd) break; search(); if (atEnd) { exhausted = true; return matchFinalize(); } } } exhausted = true; return 0; //early return } } //no search available - skip a char at a time for (;;) { immutable val = matchFinalize(); if (val) return val; else { if (atEnd) break; next(); if (atEnd) { exhausted = true; return matchFinalize(); } } } exhausted = true; return 0; } /+ match subexpression against input, results are stored in matches +/ int matchImpl() pure { if (nativeFn) { debug(std_regex_ctr) writeln("using C-T matcher"); return nativeFn(this); } else { pc = 0; counter = 0; lastState = 0; infiniteNesting = 0; matches[] = Group!DataIndex.init; auto start = s._index; debug(std_regex_matcher) writeln("Try match starting at ", s[index .. s.lastIndex]); for (;;) { debug(std_regex_matcher) writefln("PC: %s\tCNT: %s\t%s \tfront: %s src: %s", pc, counter, disassemble(re.ir, pc, re.dict), front, s._index); switch (re.ir[pc].code) { case IR.OrChar://assumes IRL!(OrChar) == 1 if (atEnd) goto L_backtrack; uint len = re.ir[pc].sequence; uint end = pc + len; if (re.ir[pc].data != front && re.ir[pc+1].data != front) { for (pc = pc+2; pc < end; pc++) if (re.ir[pc].data == front) break; if (pc == end) goto L_backtrack; } pc = end; next(); break; case IR.Char: if (atEnd || front != re.ir[pc].data) goto L_backtrack; pc += IRL!(IR.Char); next(); break; case IR.Any: if (atEnd) goto L_backtrack; pc += IRL!(IR.Any); next(); break; case IR.CodepointSet: if (atEnd || !re.charsets[re.ir[pc].data].scanFor(front)) goto L_backtrack; next(); pc += IRL!(IR.CodepointSet); break; case IR.Trie: if (atEnd || !re.matchers[re.ir[pc].data][front]) goto L_backtrack; next(); pc += IRL!(IR.Trie); break; case IR.Wordboundary: dchar back; DataIndex bi; //at start & end of input if (atStart && wordMatcher[front]) { pc += IRL!(IR.Wordboundary); break; } else if (atEnd && s.loopBack(index).nextChar(back, bi) && wordMatcher[back]) { pc += IRL!(IR.Wordboundary); break; } else if (s.loopBack(index).nextChar(back, bi)) { immutable af = wordMatcher[front]; immutable ab = wordMatcher[back]; if (af ^ ab) { pc += IRL!(IR.Wordboundary); break; } } goto L_backtrack; case IR.Notwordboundary: dchar back; DataIndex bi; //at start & end of input if (atStart && wordMatcher[front]) goto L_backtrack; else if (atEnd && s.loopBack(index).nextChar(back, bi) && wordMatcher[back]) goto L_backtrack; else if (s.loopBack(index).nextChar(back, bi)) { immutable af = wordMatcher[front]; immutable ab = wordMatcher[back]; if (af ^ ab) goto L_backtrack; } pc += IRL!(IR.Wordboundary); break; case IR.Bof: if (atStart) pc += IRL!(IR.Bol); else goto L_backtrack; break; case IR.Bol: dchar back; DataIndex bi; if (atStart) pc += IRL!(IR.Bol); else if (s.loopBack(index).nextChar(back,bi) && endOfLine(back, front == '\n')) { pc += IRL!(IR.Bol); } else goto L_backtrack; break; case IR.Eof: if (atEnd) pc += IRL!(IR.Eol); else goto L_backtrack; break; case IR.Eol: dchar back; DataIndex bi; debug(std_regex_matcher) writefln("EOL (front 0x%x) %s", front, s[index .. s.lastIndex]); //no matching inside \r\n if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back,bi) && back == '\r'))) { pc += IRL!(IR.Eol); } else goto L_backtrack; break; case IR.InfiniteStart, IR.InfiniteQStart: pc += re.ir[pc].data + IRL!(IR.InfiniteStart); //now pc is at end IR.Infinite(Q)End uint len = re.ir[pc].data; if (re.ir[pc].code == IR.InfiniteEnd) { pushState(pc+IRL!(IR.InfiniteEnd), counter); pc -= len; } else { pushState(pc - len, counter); pc += IRL!(IR.InfiniteEnd); } break; case IR.InfiniteBloomStart: pc += re.ir[pc].data + IRL!(IR.InfiniteBloomStart); //now pc is at end IR.InfiniteBloomEnd immutable len = re.ir[pc].data; immutable filterIdx = re.ir[pc+2].raw; if (re.filters[filterIdx][front]) pushState(pc+IRL!(IR.InfiniteBloomEnd), counter); pc -= len; break; case IR.RepeatStart, IR.RepeatQStart: pc += re.ir[pc].data + IRL!(IR.RepeatStart); break; case IR.RepeatEnd: case IR.RepeatQEnd: if (merge[re.ir[pc + 1].raw+counter].mark(index)) { // merged! goto L_backtrack; } //len, step, min, max immutable len = re.ir[pc].data; immutable step = re.ir[pc+2].raw; immutable min = re.ir[pc+3].raw; immutable max = re.ir[pc+4].raw; if (counter < min) { counter += step; pc -= len; } else if (counter < max) { if (re.ir[pc].code == IR.RepeatEnd) { pushState(pc + IRL!(IR.RepeatEnd), counter%step); counter += step; pc -= len; } else { pushState(pc - len, counter + step); counter = counter%step; pc += IRL!(IR.RepeatEnd); } } else { counter = counter%step; pc += IRL!(IR.RepeatEnd); } break; case IR.InfiniteEnd: case IR.InfiniteQEnd: debug(std_regex_matcher) writeln("Infinited nesting:", infiniteNesting); if (merge[re.ir[pc + 1].raw+counter].mark(index)) { // merged! goto L_backtrack; } immutable len = re.ir[pc].data; if (re.ir[pc].code == IR.InfiniteEnd) { pushState(pc + IRL!(IR.InfiniteEnd), counter); pc -= len; } else { pushState(pc-len, counter); pc += IRL!(IR.InfiniteEnd); } break; case IR.InfiniteBloomEnd: debug(std_regex_matcher) writeln("Infinited nesting:", infiniteNesting); if (merge[re.ir[pc + 1].raw+counter].mark(index)) { // merged! goto L_backtrack; } immutable len = re.ir[pc].data; immutable filterIdx = re.ir[pc+2].raw; if (re.filters[filterIdx][front]) { infiniteNesting--; pushState(pc + IRL!(IR.InfiniteBloomEnd), counter); infiniteNesting++; } pc -= len; break; case IR.OrEnd: if (merge[re.ir[pc + 1].raw+counter].mark(index)) { // merged! goto L_backtrack; } pc += IRL!(IR.OrEnd); break; case IR.OrStart: pc += IRL!(IR.OrStart); goto case; case IR.Option: immutable len = re.ir[pc].data; if (re.ir[pc+len].code == IR.GotoEndOr)//not a last one { pushState(pc + len + IRL!(IR.Option), counter); //remember 2nd branch } pc += IRL!(IR.Option); break; case IR.GotoEndOr: pc = pc + re.ir[pc].data + IRL!(IR.GotoEndOr); break; case IR.GroupStart: immutable n = re.ir[pc].data; matches[n].begin = index; debug(std_regex_matcher) writefln("IR group #%u starts at %u", n, index); pc += IRL!(IR.GroupStart); break; case IR.GroupEnd: immutable n = re.ir[pc].data; matches[n].end = index; debug(std_regex_matcher) writefln("IR group #%u ends at %u", n, index); pc += IRL!(IR.GroupEnd); break; case IR.LookaheadStart: case IR.NeglookaheadStart: immutable len = re.ir[pc].data; auto save = index; immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw; auto mem = pureMalloc(initialMemory(re))[0 .. initialMemory(re)]; scope(exit) pureFree(mem.ptr); auto slicedRe = re.withCode(re.ir[ pc+IRL!(IR.LookaheadStart) .. pc+IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd) ]); static if (Stream.isLoopback) { auto matcher = bwdMatcher(slicedRe, mem); } else { auto matcher = fwdMatcher(slicedRe, mem); } matcher.matches = matches[ms .. me]; matcher.backrefed = backrefed.empty ? matches : backrefed; immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookaheadStart); s.reset(save); next(); if (!match) goto L_backtrack; else { pc += IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd); } break; case IR.LookbehindStart: case IR.NeglookbehindStart: immutable len = re.ir[pc].data; immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw; auto mem = pureMalloc(initialMemory(re))[0 .. initialMemory(re)]; scope(exit) pureFree(mem.ptr); auto slicedRe = re.withCode(re.ir[ pc + IRL!(IR.LookbehindStart) .. pc + IRL!(IR.LookbehindStart) + len + IRL!(IR.LookbehindEnd) ]); static if (Stream.isLoopback) { alias Matcher = BacktrackingMatcher!(Char, Stream); auto matcher = new Matcher(slicedRe, s, mem, front, index); } else { alias Matcher = BacktrackingMatcher!(Char, typeof(s.loopBack(index))); auto matcher = new Matcher(slicedRe, s.loopBack(index), mem); } matcher.matches = matches[ms .. me]; matcher.backrefed = backrefed.empty ? matches : backrefed; immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookbehindStart); if (!match) goto L_backtrack; else { pc += IRL!(IR.LookbehindStart)+len+IRL!(IR.LookbehindEnd); } break; case IR.Backref: immutable n = re.ir[pc].data; auto referenced = re.ir[pc].localRef ? s[matches[n].begin .. matches[n].end] : s[backrefed[n].begin .. backrefed[n].end]; while (!atEnd && !referenced.empty && front == referenced.front) { next(); referenced.popFront(); } if (referenced.empty) pc++; else goto L_backtrack; break; case IR.Nop: pc += IRL!(IR.Nop); break; case IR.LookaheadEnd: case IR.NeglookaheadEnd: case IR.LookbehindEnd: case IR.NeglookbehindEnd: case IR.End: // cleanup stale stack blocks if any while (prevStack()) {} return re.ir[pc].data; default: debug printBytecode(re.ir[0..$]); assert(0); L_backtrack: if (!popState()) { s.reset(start); return 0; } } } } assert(0); } @property size_t stackAvail() { return memory.length - lastState; } void stackPush(T)(T val) if (!isDynamicArray!T) { *cast(T*)&memory[lastState] = val; enum delta = (T.sizeof+size_t.sizeof/2)/size_t.sizeof; lastState += delta; debug(std_regex_matcher) writeln("push element SP= ", lastState); } void stackPush(T)(T[] val) { static assert(T.sizeof % size_t.sizeof == 0); (cast(T*)&memory[lastState])[0 .. val.length] = val[0..$]; lastState += val.length*(T.sizeof/size_t.sizeof); debug(std_regex_matcher) writeln("push array SP= ", lastState); } void stackPop(T)(ref T val) if (!isDynamicArray!T) { enum delta = (T.sizeof+size_t.sizeof/2)/size_t.sizeof; lastState -= delta; val = *cast(T*)&memory[lastState]; debug(std_regex_matcher) writeln("pop element SP= ", lastState); } void stackPop(T)(T[] val) { stackPop(val); // call ref version } void stackPop(T)(ref T[] val) { lastState -= val.length*(T.sizeof/size_t.sizeof); val[0..$] = (cast(T*)&memory[lastState])[0 .. val.length]; debug(std_regex_matcher) writeln("pop array SP= ", lastState); } //helper function, saves engine state void pushState(uint pc, uint counter) { if (stateSize + 2 * matches.length > stackAvail) { newStack(); } *cast(State*)&memory[lastState] = State(index, pc, counter, infiniteNesting); lastState += stateSize; memory[lastState .. lastState + 2 * matches.length] = (cast(size_t[]) matches)[]; lastState += 2*matches.length; debug(std_regex_matcher) writefln("Saved(pc=%s) front: %s src: %s", pc, front, s[index .. s.lastIndex]); } //helper function, restores engine state bool popState() { if (!lastState && !prevStack()) return false; lastState -= 2*matches.length; auto pm = cast(size_t[]) matches; pm[] = memory[lastState .. lastState + 2 * matches.length]; lastState -= stateSize; State* state = cast(State*)&memory[lastState]; index = state.index; pc = state.pc; counter = state.counter; infiniteNesting = state.infiniteNesting; debug(std_regex_matcher) { writefln("Restored matches", front, s[index .. s.lastIndex]); foreach (i, m; matches) writefln("Sub(%d) : %s..%s", i, m.begin, m.end); } s.reset(index); next(); debug(std_regex_matcher) writefln("Backtracked (pc=%s) front: %s src: %s", pc, front, s[index .. s.lastIndex]); return true; } } //very shitty string formatter, $$ replaced with next argument converted to string @trusted string ctSub( U...)(string format, U args) { import std.conv : to; bool seenDollar; foreach (i, ch; format) { if (ch == '$') { if (seenDollar) { static if (args.length > 0) { return format[0 .. i - 1] ~ to!string(args[0]) ~ ctSub(format[i + 1 .. $], args[1 .. $]); } else assert(0); } else seenDollar = true; } else seenDollar = false; } return format; } struct CtContext { import std.conv : to, text; //dirty flags bool counter; //to mark the portion of matches to save int match, total_matches; int reserved; const(CodepointInterval)[][] charsets; //state of codegenerator static struct CtState { string code; int addr; } this(Char)(ref const Regex!Char re) { match = 1; reserved = 1; //first match is skipped total_matches = re.ngroup; foreach (ref set; re.charsets) { charsets ~= set.intervals; } } CtContext lookaround(uint s, uint e) { CtContext ct; ct.total_matches = e - s; ct.match = 1; return ct; } //restore state having current context string restoreCode() { string text; //stack is checked in L_backtrack text ~= counter ? " stackPop(counter);" : " counter = 0;"; if (match < total_matches) { text ~= ctSub(" stackPop(matches[$$..$$]);", reserved, match); text ~= ctSub(" matches[$$..$] = typeof(matches[0]).init;", match); } else text ~= ctSub(" stackPop(matches[$$..$]);", reserved); return text; } //save state having current context string saveCode(uint pc, string count_expr="counter") { string text = ctSub(" if (stackAvail < $$*(Group!(DataIndex)).sizeof/size_t.sizeof + $$) { newStack(); }", match - reserved, cast(int) counter + 2); if (match < total_matches) text ~= ctSub(" stackPush(matches[$$..$$]);", reserved, match); else text ~= ctSub(" stackPush(matches[$$..$]);", reserved); text ~= counter ? ctSub(" stackPush($$);", count_expr) : ""; text ~= ctSub(" stackPush(index); stackPush($$); \n", pc); return text; } // CtState ctGenBlock(const(Bytecode)[] ir, int addr) { CtState result; result.addr = addr; while (!ir.empty) { auto n = ctGenGroup(ir, result.addr); result.code ~= n.code; result.addr = n.addr; } return result; } // CtState ctGenGroup(ref const(Bytecode)[] ir, int addr) { import std.algorithm.comparison : max; auto bailOut = "goto L_backtrack;"; auto nextInstr = ctSub("goto case $$;", addr+1); CtState r; assert(!ir.empty); switch (ir[0].code) { case IR.InfiniteStart, IR.InfiniteBloomStart,IR.InfiniteQStart, IR.RepeatStart, IR.RepeatQStart: immutable infLoop = ir[0].code == IR.InfiniteStart || ir[0].code == IR.InfiniteQStart || ir[0].code == IR.InfiniteBloomStart; counter = counter || ir[0].code == IR.RepeatStart || ir[0].code == IR.RepeatQStart; immutable len = ir[0].data; auto nir = ir[ir[0].length .. ir[0].length+len]; r = ctGenBlock(nir, addr+1); //start/end codegen //r.addr is at last test+ jump of loop, addr+1 is body of loop nir = ir[ir[0].length + len .. $]; r.code = ctGenFixupCode(ir[0 .. ir[0].length], addr, r.addr) ~ r.code; r.code ~= ctGenFixupCode(nir, r.addr, addr+1); r.addr += 2; //account end instruction + restore state ir = nir; break; case IR.OrStart: immutable len = ir[0].data; auto nir = ir[ir[0].length .. ir[0].length+len]; r = ctGenAlternation(nir, addr); ir = ir[ir[0].length + len .. $]; assert(ir[0].code == IR.OrEnd); ir = ir[ir[0].length..$]; break; case IR.LookaheadStart: case IR.NeglookaheadStart: case IR.LookbehindStart: case IR.NeglookbehindStart: immutable len = ir[0].data; immutable behind = ir[0].code == IR.LookbehindStart || ir[0].code == IR.NeglookbehindStart; immutable negative = ir[0].code == IR.NeglookaheadStart || ir[0].code == IR.NeglookbehindStart; string fwdType = "typeof(fwdMatcher(re, []))"; string bwdType = "typeof(bwdMatcher(re, []))"; string fwdCreate = "fwdMatcher(re, mem)"; string bwdCreate = "bwdMatcher(re, mem)"; immutable start = IRL!(IR.LookbehindStart); immutable end = IRL!(IR.LookbehindStart)+len+IRL!(IR.LookaheadEnd); CtContext context = lookaround(ir[1].raw, ir[2].raw); //split off new context auto slice = ir[start .. end]; r.code ~= ctSub(` case $$: //fake lookaround "atom" static if (typeof(matcher.s).isLoopback) alias Lookaround = $$; else alias Lookaround = $$; static bool matcher_$$(Lookaround matcher) @trusted { //(neg)lookaround piece start $$ //(neg)lookaround piece ends } auto save = index; auto mem = pureMalloc(initialMemory(re))[0 .. initialMemory(re)]; scope(exit) pureFree(mem.ptr); static if (typeof(matcher.s).isLoopback) auto lookaround = $$; else auto lookaround = $$; lookaround.matches = matches[$$..$$]; lookaround.backrefed = backrefed.empty ? matches : backrefed; lookaround.nativeFn = &matcher_$$; //hookup closure's binary code int match = $$; s.reset(save); next(); if (match) $$ else $$`, addr, behind ? fwdType : bwdType, behind ? bwdType : fwdType, addr, context.ctGenRegEx(slice), behind ? fwdCreate : bwdCreate, behind ? bwdCreate : fwdCreate, ir[1].raw, ir[2].raw, //start - end of matches slice addr, negative ? "!lookaround.matchImpl()" : "lookaround.matchImpl()", nextInstr, bailOut); ir = ir[end .. $]; r.addr = addr + 1; break; case IR.LookaheadEnd: case IR.NeglookaheadEnd: case IR.LookbehindEnd: case IR.NeglookbehindEnd: ir = ir[IRL!(IR.LookaheadEnd) .. $]; r.addr = addr; break; default: assert(ir[0].isAtom, text(ir[0].mnemonic)); r = ctGenAtom(ir, addr); } return r; } //generate source for bytecode contained in OrStart ... OrEnd CtState ctGenAlternation(const(Bytecode)[] ir, int addr) { CtState[] pieces; CtState r; enum optL = IRL!(IR.Option); for (;;) { assert(ir[0].code == IR.Option); auto len = ir[0].data; if (optL+len < ir.length && ir[optL+len].code == IR.Option)//not a last option { auto nir = ir[optL .. optL+len-IRL!(IR.GotoEndOr)]; r = ctGenBlock(nir, addr+2);//space for Option + restore state //r.addr+1 to account GotoEndOr at end of branch r.code = ctGenFixupCode(ir[0 .. ir[0].length], addr, r.addr+1) ~ r.code; addr = r.addr+1;//leave space for GotoEndOr pieces ~= r; ir = ir[optL + len .. $]; } else { pieces ~= ctGenBlock(ir[optL..$], addr); addr = pieces[$-1].addr; break; } } r = pieces[0]; for (uint i = 1; i < pieces.length; i++) { r.code ~= ctSub(` case $$: goto case $$; `, pieces[i-1].addr, addr); r.code ~= pieces[i].code; } r.addr = addr; return r; } // generate fixup code for instruction in ir, // fixup means it has an alternative way for control flow string ctGenFixupCode(const(Bytecode)[] ir, int addr, int fixup) { return ctGenFixupCode(ir, addr, fixup); // call ref Bytecode[] version } string ctGenFixupCode(ref const(Bytecode)[] ir, int addr, int fixup) { string r; string testCode; r = ctSub(` case $$: debug(std_regex_matcher) writeln("#$$");`, addr, addr); switch (ir[0].code) { case IR.InfiniteStart, IR.InfiniteQStart, IR.InfiniteBloomStart: r ~= ctSub( ` goto case $$;`, fixup); ir = ir[ir[0].length..$]; break; case IR.InfiniteEnd: testCode = ctQuickTest(ir[IRL!(IR.InfiniteEnd) .. $],addr + 1); r ~= ctSub( ` if (merge[$$+counter].mark(index)) { // merged! goto L_backtrack; } $$ { $$ } goto case $$; case $$: //restore state and go out of loop $$ goto case;`, ir[1].raw, testCode, saveCode(addr+1), fixup, addr+1, restoreCode()); ir = ir[ir[0].length..$]; break; case IR.InfiniteBloomEnd: //TODO: check bloom filter and skip on failure testCode = ctQuickTest(ir[IRL!(IR.InfiniteBloomEnd) .. $],addr + 1); r ~= ctSub( ` if (merge[$$+counter].mark(index)) { // merged! goto L_backtrack; } $$ { $$ } goto case $$; case $$: //restore state and go out of loop $$ goto case;`, ir[1].raw, testCode, saveCode(addr+1), fixup, addr+1, restoreCode()); ir = ir[ir[0].length..$]; break; case IR.InfiniteQEnd: testCode = ctQuickTest(ir[IRL!(IR.InfiniteEnd) .. $],addr + 1); auto altCode = testCode.length ? ctSub("else goto case $$;", fixup) : ""; r ~= ctSub( ` if (merge[$$+counter].mark(index)) { // merged! goto L_backtrack; } $$ { $$ goto case $$; } $$ case $$://restore state and go inside loop $$ goto case $$;`, ir[1].raw, testCode, saveCode(addr+1), addr+2, altCode, addr+1, restoreCode(), fixup); ir = ir[ir[0].length..$]; break; case IR.RepeatStart, IR.RepeatQStart: r ~= ctSub( ` goto case $$;`, fixup); ir = ir[ir[0].length..$]; break; case IR.RepeatEnd, IR.RepeatQEnd: //len, step, min, max immutable len = ir[0].data; immutable step = ir[2].raw; immutable min = ir[3].raw; immutable max = ir[4].raw; r ~= ctSub(` if (merge[$$+counter].mark(index)) { // merged! goto L_backtrack; } if (counter < $$) { debug(std_regex_matcher) writeln("RepeatEnd min case pc=", $$); counter += $$; goto case $$; }`, ir[1].raw, min, addr, step, fixup); if (ir[0].code == IR.RepeatEnd) { string counter_expr = ctSub("counter % $$", step); r ~= ctSub(` else if (counter < $$) { $$ counter += $$; goto case $$; }`, max, saveCode(addr+1, counter_expr), step, fixup); } else { string counter_expr = ctSub("counter % $$", step); r ~= ctSub(` else if (counter < $$) { $$ counter = counter % $$; goto case $$; }`, max, saveCode(addr+1,counter_expr), step, addr+2); } r ~= ctSub(` else { counter = counter % $$; goto case $$; } case $$: //restore state $$ goto case $$;`, step, addr+2, addr+1, restoreCode(), ir[0].code == IR.RepeatEnd ? addr+2 : fixup ); ir = ir[ir[0].length..$]; break; case IR.Option: r ~= ctSub( ` { $$ } goto case $$; case $$://restore thunk to go to the next group $$ goto case $$;`, saveCode(addr+1), addr+2, addr+1, restoreCode(), fixup); ir = ir[ir[0].length..$]; break; default: assert(0, text(ir[0].mnemonic)); } return r; } string ctQuickTest(const(Bytecode)[] ir, int id) { uint pc = 0; while (pc < ir.length && ir[pc].isAtom) { if (ir[pc].code == IR.GroupStart || ir[pc].code == IR.GroupEnd) { pc++; } else if (ir[pc].code == IR.Backref) break; else { auto code = ctAtomCode(ir[pc..$], -1); return ctSub(` int test_$$() { $$ //$$ } if (test_$$() >= 0)`, id, code.ptr ? code : "return 0;", ir[pc].mnemonic, id); } } return ""; } //process & generate source for simple bytecodes at front of ir using address addr CtState ctGenAtom(ref const(Bytecode)[] ir, int addr) { CtState result; result.code = ctAtomCode(ir, addr); ir.popFrontN(ir[0].code == IR.OrChar ? ir[0].sequence : ir[0].length); result.addr = addr + 1; return result; } //D code for atom at ir using address addr, addr < 0 means quickTest string ctAtomCode(const(Bytecode)[] ir, int addr) { string code; string bailOut, nextInstr; if (addr < 0) { bailOut = "return -1;"; nextInstr = "return 0;"; } else { bailOut = "goto L_backtrack;"; nextInstr = ctSub("goto case $$;", addr+1); code ~= ctSub( ` case $$: debug(std_regex_matcher) writeln("#$$"); `, addr, addr); } switch (ir[0].code) { case IR.OrChar://assumes IRL!(OrChar) == 1 code ~= ctSub(` if (atEnd) $$`, bailOut); immutable len = ir[0].sequence; for (uint i = 0; i < len; i++) { code ~= ctSub( ` if (front == $$) { $$ $$ }`, ir[i].data, addr >= 0 ? "next();" :"", nextInstr); } code ~= ctSub( ` $$`, bailOut); break; case IR.Char: code ~= ctSub( ` if (atEnd || front != $$) $$ $$ $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr); break; case IR.Any: code ~= ctSub( ` if (atEnd || (!(re.flags & RegexOption.singleline) && (front == '\r' || front == '\n'))) $$ $$ $$`, bailOut, addr >= 0 ? "next();" :"",nextInstr); break; case IR.CodepointSet: if (charsets.length) { string name = `func_`~to!string(addr+1); string funcCode = CodepointSet.toSourceCode(charsets[ir[0].data], name); code ~= ctSub( ` static $$ if (atEnd || !$$(front)) $$ $$ $$`, funcCode, name, bailOut, addr >= 0 ? "next();" :"", nextInstr); } else code ~= ctSub( ` if (atEnd || !re.charsets[$$].scanFor(front)) $$ $$ $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr); break; case IR.Trie: if (charsets.length && charsets[ir[0].data].length <= 8) goto case IR.CodepointSet; code ~= ctSub( ` if (atEnd || !re.matchers[$$][front]) $$ $$ $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr); break; case IR.Wordboundary: code ~= ctSub( ` dchar back; DataIndex bi; if (atStart && wordMatcher[front]) { $$ } else if (atEnd && s.loopBack(index).nextChar(back, bi) && wordMatcher[back]) { $$ } else if (s.loopBack(index).nextChar(back, bi)) { bool af = wordMatcher[front]; bool ab = wordMatcher[back]; if (af ^ ab) { $$ } } $$`, nextInstr, nextInstr, nextInstr, bailOut); break; case IR.Notwordboundary: code ~= ctSub( ` dchar back; DataIndex bi; //at start & end of input if (atStart && wordMatcher[front]) $$ else if (atEnd && s.loopBack(index).nextChar(back, bi) && wordMatcher[back]) $$ else if (s.loopBack(index).nextChar(back, bi)) { bool af = wordMatcher[front]; bool ab = wordMatcher[back]; if (af ^ ab) $$ } $$`, bailOut, bailOut, bailOut, nextInstr); break; case IR.Bol: code ~= ctSub(` dchar back; DataIndex bi; if (atStart || (s.loopBack(index).nextChar(back,bi) && endOfLine(back, front == '\n'))) { debug(std_regex_matcher) writeln("BOL matched"); $$ } else $$`, nextInstr, bailOut); break; case IR.Bof: code ~= ctSub(` if (atStart) { debug(std_regex_matcher) writeln("BOF matched"); $$ } else $$`, nextInstr, bailOut); break; case IR.Eol: code ~= ctSub(` dchar back; DataIndex bi; debug(std_regex_matcher) writefln("EOL (front 0x%x) %s", front, s[index .. s.lastIndex]); //no matching inside \r\n if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back,bi) && back == '\r'))) { debug(std_regex_matcher) writeln("EOL matched"); $$ } else $$`, nextInstr, bailOut); break; case IR.Eof: code ~= ctSub(` if (atEnd) { debug(std_regex_matcher) writeln("BOF matched"); $$ } else $$`, nextInstr, bailOut); break; case IR.GroupStart: code ~= ctSub(` matches[$$].begin = index; $$`, ir[0].data, nextInstr); match = ir[0].data+1; break; case IR.GroupEnd: code ~= ctSub(` matches[$$].end = index; $$`, ir[0].data, nextInstr); break; case IR.Backref: string mStr = "auto referenced = "; mStr ~= ir[0].localRef ? ctSub("s[matches[$$].begin .. matches[$$].end];", ir[0].data, ir[0].data) : ctSub("s[backrefed[$$].begin .. backrefed[$$].end];", ir[0].data, ir[0].data); code ~= ctSub( ` $$ while (!atEnd && !referenced.empty && front == referenced.front) { next(); referenced.popFront(); } if (referenced.empty) $$ else $$`, mStr, nextInstr, bailOut); break; case IR.Nop: case IR.End: break; default: assert(0, text(ir[0].mnemonic, " is not supported yet")); } return code; } //generate D code for the whole regex public string ctGenRegEx(const(Bytecode)[] ir) { auto bdy = ctGenBlock(ir, 0); auto r = ` import core.memory : pureMalloc, pureFree; with(matcher) { pc = 0; counter = 0; lastState = 0; matches[] = Group!DataIndex.init; auto start = s._index;`; r ~= ` goto StartLoop; debug(std_regex_matcher) writeln("Try CT matching starting at ",s[index .. s.lastIndex]); L_backtrack: if (lastState || prevStack()) { stackPop(pc); stackPop(index); s.reset(index); next(); } else { s.reset(start); return false; } StartLoop: switch (pc) { `; r ~= bdy.code; r ~= ctSub(` case $$: break;`,bdy.addr); r ~= ` default: assert(0); } // cleanup stale stack blocks while (prevStack()) {} return true; } `; return r; } } string ctGenRegExCode(Char)(const Regex!Char re) { auto context = CtContext(re); return context.ctGenRegEx(re.ir); }