/**
 * Implements the lexical analyzer, which converts source code into lexical tokens.
 *
 * Specification: $(LINK2 https://dlang.org/spec/lex.html, Lexical)
 *
 * Copyright:   Copyright (C) 1999-2022 by The D Language Foundation, All Rights Reserved
 * Authors:     $(LINK2 https://www.digitalmars.com, Walter Bright)
 * License:     $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/lexer.d, _lexer.d)
 * Documentation:  https://dlang.org/phobos/dmd_lexer.html
 * Coverage:    https://codecov.io/gh/dlang/dmd/src/master/src/dmd/lexer.d
 */

module dmd.lexer;

import core.stdc.ctype;
import core.stdc.errno;
import core.stdc.stdarg;
import core.stdc.stdio;
import core.stdc.stdlib : getenv;
import core.stdc.string;
import core.stdc.time;

import dmd.entity;
import dmd.errors;
import dmd.globals;
import dmd.id;
import dmd.identifier;
import dmd.root.array;
import dmd.root.ctfloat;
import dmd.common.outbuffer;
import dmd.root.port;
import dmd.root.rmem;
import dmd.root.string;
import dmd.root.utf;
import dmd.tokens;
import dmd.utils;

nothrow:

version (DMDLIB) {
    version = LocOffset;
}

/***********************************************************
 */
class Lexer {
    private __gshared OutBuffer stringbuffer;

    Loc scanloc;  // for error messages
    Loc prevloc;  // location of token before current

    const(char)* p; // current character

    Token token;

    // For ImportC
    bool Ccompile;  /// true if compiling ImportC

    // The following are valid only if (Ccompile == true)
    ubyte boolsize;         /// size of a C _Bool, default 1
    ubyte shortsize;        /// size of a C short, default 2
    ubyte intsize;          /// size of a C int, default 4
    ubyte longsize;         /// size of C long, 4 or 8
    ubyte long_longsize;    /// size of a C long long, default 8
    ubyte long_doublesize;  /// size of C long double, 8 or D real.sizeof
    ubyte wchar_tsize;      /// size of C wchar_t, 2 or 4

    private {
        const(char)* base;      // pointer to start of buffer
        const(char)* end;       // pointer to last element of buffer
        const(char)* line;      // start of current line

        bool doDocComment;      // collect doc comment information
        bool anyToken;          // seen at least one token
        bool commentToken;      // comments are TOK.comment's
        bool tokenizeNewlines;  // newlines are turned into TOK.endOfLine's

        version (DMDLIB) {
            bool whitespaceToken; // tokenize whitespaces
        }

        int inTokenStringConstant; // can be larger than 1 when in nested q{} strings
        int lastDocLine;           // last line of previous doc comment

        Token* tokenFreelist;
    }

  nothrow:

    /*********************
     * Creates a Lexer for the source code base[begoffset..endoffset+1].
     * The last character, base[endoffset], must be null (0) or EOF (0x1A).
     *
     * Params:
     *  filename = used for error messages
     *  base = source code, must be terminated by a null (0) or EOF (0x1A) character
     *  begoffset = starting offset into base[]
     *  endoffset = the last offset to read into base[]
     *  doDocComment = handle documentation comments
     *  commentToken = comments become TOK.comment's
     */
    this(const(char)* filename, const(char)* base, size_t begoffset,
        size_t endoffset, bool doDocComment, bool commentToken) pure {
        scanloc = Loc(filename, 1, 1);
        // debug printf("Lexer::Lexer(%p)\n", base);
        // debug printf("lexer.filename = %s\n", filename);
        token = Token.init;
        this.base = base;
        this.end = base + endoffset;
        p = base + begoffset;
        line = p;
        this.doDocComment = doDocComment;
        this.commentToken = commentToken;
        this.tokenizeNewlines = false;
        this.inTokenStringConstant = 0;
        this.lastDocLine = 0;
        //initKeywords();
        /* If first line starts with '#!', ignore the line
         */
        if (p && p[0] == '#' && p[1] == '!') {
            p += 2;
            while (1) {
                char c = *p++;
                switch (c) {
                case 0:
                case 0x1A:
                    p--; // stay on the terminator
                    goto case;
                case '\n':
                    break;
                default:
                    continue;
                }
                break;
            }
            endOfLine();
        }
    }

    version (DMDLIB) {
        /// Same as the primary constructor, additionally selecting whether
        /// whitespace is reported as `TOK.whitespace` tokens.
        this(const(char)* filename, const(char)* base, size_t begoffset, size_t endoffset,
            bool doDocComment, bool commentToken, bool whitespaceToken) {
            this(filename, base, begoffset, endoffset, doDocComment, commentToken);
            this.whitespaceToken = whitespaceToken;
        }

        bool empty() const pure @property @nogc @safe {
            return front() == TOK.endOfFile;
        }

        TOK front() const pure @property @nogc @safe {
            return token.value;
        }

        void popFront() {
            nextToken();
        }
    }

    /// Returns: a newly allocated `Token`.
    Token* allocateToken() pure nothrow @safe {
        if (tokenFreelist) {
            Token* t = tokenFreelist;
            tokenFreelist = t.next;
            t.next = null;
            return t;
        }
        return new Token();
    }

    /// Frees the given token by returning it to the freelist.
    private void releaseToken(Token* token) pure nothrow @nogc @safe {
        if (mem.isGCEnabled)
            *token = Token.init;
        token.next = tokenFreelist;
        tokenFreelist = token;
    }

    /***********************
     * Advance `token` to the next token: either the already peeked one
     * (`token.next`, which is then released to the freelist) or a freshly scanned one.
     * Returns: the value of the new current token
     */
    final TOK nextToken() {
        prevloc = token.loc;
        if (token.next) {
            Token* t = token.next;
            memcpy(&token, t, Token.sizeof);
            releaseToken(t);
        } else {
            scan(&token);
        }
        //printf(token.toChars());
        return token.value;
    }

    /***********************
     * Look ahead at next token's value.
     */
    final TOK peekNext() {
        return peek(&token).value;
    }

    /***********************
     * Look 2 tokens ahead at value.
     */
    final TOK peekNext2() {
        Token* t = peek(&token);
        return peek(t).value;
    }

    /****************************
     * Turn next token in buffer into a token.
     * Params:
     *  t = the token to set the resulting Token to
     */
    final void scan(Token* t) {
        const lastLine = scanloc.linnum;
        Loc startLoc;
        t.blockComment = null;
        t.lineComment = null;

        while (1) {
            t.ptr = p;
            //printf("p = %p, *p = '%c'\n",p,*p);
            t.loc = loc();
            switch (*p) {
            case 0:
            case 0x1A:
                t.value = TOK.endOfFile; // end of file
                // Intentionally not advancing `p`, such that subsequent calls keep returning TOK.endOfFile.
                return;
            case ' ':
                // Skip 4 spaces at a time after aligning 'p' to a 4-byte boundary.
                while ((cast(size_t)p) % uint.sizeof) {
                    if (*p != ' ')
                        goto LendSkipFourSpaces;
                    p++;
                }
                while (*(cast(uint*)p) == 0x20202020) // ' ' == 0x20
                    p += 4;
                // Skip over any remaining space on the line.
                while (*p == ' ')
                    p++;
            LendSkipFourSpaces:
                version (DMDLIB) {
                    if (whitespaceToken) {
                        t.value = TOK.whitespace;
                        return;
                    }
                }
                continue; // skip white space
            case '\t':
            case '\v':
            case '\f':
                p++;
                version (DMDLIB) {
                    if (whitespaceToken) {
                        t.value = TOK.whitespace;
                        return;
                    }
                }
                continue; // skip white space
            case '\r':
                p++;
                if (*p != '\n') { // if CR stands by itself
                    endOfLine();
                    if (tokenizeNewlines) {
                        t.value = TOK.endOfLine;
                        tokenizeNewlines = false;
                        return;
                    }
                }
                version (DMDLIB) {
                    if (whitespaceToken) {
                        t.value = TOK.whitespace;
                        return;
                    }
                }
                continue; // skip white space
            case '\n':
                p++;
                endOfLine();
                if (tokenizeNewlines) {
                    t.value = TOK.endOfLine;
                    tokenizeNewlines = false;
                    return;
                }
                version (DMDLIB) {
                    if (whitespaceToken) {
                        t.value = TOK.whitespace;
                        return;
                    }
                }
                continue; // skip white space
            case '0':
                if (!isZeroSecond(p[1])) { // if numeric literal does not continue
                    ++p;
                    t.unsvalue = 0;
                    t.value = TOK.int32Literal;
                    return;
                }
                goto Lnumber;
            case '1': .. case '9':
                if (!isDigitSecond(p[1])) { // if numeric literal does not continue
                    t.unsvalue = *p - '0';
                    ++p;
                    t.value = TOK.int32Literal;
                    return;
                }
            Lnumber:
                t.value = number(t);
                return;
            case '\'':
                if (issinglechar(p[1]) && p[2] == '\'') {
                    t.unsvalue = p[1]; // simple one character literal
                    t.value = TOK.charLiteral;
                    p += 3;
                } else if (Ccompile) {
                    clexerCharConstant(*t, 0);
                } else {
                    t.value = charConstant(t);
                }
                return;
            case 'u':
            case 'U':
            case 'L':
                if (!Ccompile)
                    goto case_ident;
                if (p[1] == '\'') { // C wide character constant
                    char c = *p;
                    // convert L to u or U: a 4 byte wchar_t is a 32 bit literal ('U'),
                    // otherwise a 16 bit one ('u'); this mirrors the L"..." branch below.
                    if (c == 'L')
                        c = (wchar_tsize == 4) ? 'U' : 'u';
                    ++p;
                    clexerCharConstant(*t, c);
                    return;
                } else if (p[1] == '\"') { // C wide string literal
                    const c = *p;
                    ++p;
                    escapeStringConstant(t);
                    t.postfix = c == 'L' ? (wchar_tsize == 2 ? 'w' : 'd') :
                                c == 'u' ? 'w' : 'd';
                    return;
                } else if (p[1] == '8' && p[2] == '\"') { // C UTF-8 string literal
                    p += 2;
                    escapeStringConstant(t);
                    return;
                }
                goto case_ident;
            case 'r':
                if (Ccompile || p[1] != '"')
                    goto case_ident;
                p++;
                goto case '`';
            case '`':
                if (Ccompile)
                    goto default;
                wysiwygStringConstant(t);
                return;
            case 'q':
                if (Ccompile)
                    goto case_ident;
                if (p[1] == '"') {
                    p++;
                    delimitedStringConstant(t);
                    return;
                } else if (p[1] == '{') {
                    p++;
                    tokenStringConstant(t);
                    return;
                } else
                    goto case_ident;
            case '"':
                escapeStringConstant(t);
                return;
            case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
            case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
            case 'm': case 'n': case 'o': case 'p':
                /*case 'q': case 'r':*/
            case 's': case 't':
                //case 'u':
            case 'v': case 'w': case 'x': case 'y': case 'z':
            case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
            case 'G': case 'H': case 'I': case 'J': case 'K':
                //case 'L':
            case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
            case 'S': case 'T':
                //case 'U':
            case 'V': case 'W': case 'X': case 'Y': case 'Z':
            case '_':
            case_ident:
                {
                    while (1) {
                        const c = *++p;
                        if (isidchar(c))
                            continue;
                        else if (c & 0x80) {
                            const s = p;
                            const u = decodeUTF();
                            if (isUniAlpha(u))
                                continue;
                            error("char 0x%04x not allowed in identifier", u);
                            p = s;
                        }
                        break;
                    }
                    Identifier id = Identifier.idPool(cast(char*)t.ptr, cast(uint)(p - t.ptr));
                    t.ident = id;
                    t.value = cast(TOK)id.getValue();
                    anyToken = 1;

                    /* Different keywords for C and D
                     */
                    if (Ccompile) {
                        if (t.value != TOK.identifier) {
                            t.value = Ckeywords[t.value]; // filter out D keywords
                        }
                    } else if (t.value >= FirstCKeyword)
                        t.value = TOK.identifier; // filter out C keywords
                    else if (*t.ptr == '_') { // if special identifier token
                        // Lazy initialization
                        TimeStampInfo.initialize(t.loc);
                        if (id == Id.DATE) {
                            t.ustring = TimeStampInfo.date.ptr;
                            goto Lstr;
                        } else if (id == Id.TIME) {
                            t.ustring = TimeStampInfo.time.ptr;
                            goto Lstr;
                        } else if (id == Id.VENDOR) {
                            t.ustring = global.vendor.xarraydup.ptr;
                            goto Lstr;
                        } else if (id == Id.TIMESTAMP) {
                            t.ustring = TimeStampInfo.timestamp.ptr;
                        Lstr:
                            t.value = TOK.string_;
                            t.postfix = 0;
                            t.len = cast(uint)strlen(t.ustring);
                        } else if (id == Id.VERSIONX) {
                            t.value = TOK.int64Literal;
                            t.unsvalue = global.versionNumber();
                        } else if (id == Id.EOFX) {
                            t.value = TOK.endOfFile;
                            // Advance scanner to end of file
                            while (!(*p == 0 || *p == 0x1A))
                                p++;
                        }
                    }
                    //printf("t.value = %d\n",t.value);
                    return;
                }
            case '/':
                p++;
                switch (*p) {
                case '=':
                    p++;
                    t.value = TOK.divAssign;
                    return;
                case '*':
                    p++;
                    startLoc = loc();
                    while (1) {
                        while (1) {
                            const c = *p;
                            switch (c) {
                            case '/':
                                break;
                            case '\n':
                                endOfLine();
                                p++;
                                continue;
                            case '\r':
                                p++;
                                if (*p != '\n')
                                    endOfLine();
                                continue;
                            case 0:
                            case 0x1A:
                                error("unterminated /* */ comment");
                                p = end;
                                t.loc = loc();
                                t.value = TOK.endOfFile;
                                return;
                            default:
                                if (c & 0x80) {
                                    const u = decodeUTF();
                                    if (u == PS || u == LS)
                                        endOfLine();
                                }
                                p++;
                                continue;
                            }
                            break;
                        }
                        p++;
                        if (p[-2] == '*' && p - 3 != t.ptr)
                            break;
                    }
                    if (commentToken) {
                        t.loc = startLoc;
                        t.value = TOK.comment;
                        return;
                    } else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr) {
                        // if /** but not /**/
                        getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                        lastDocLine = scanloc.linnum;
                    }
                    continue;
                case '/': // do // style comments
                    startLoc = loc();
                    while (1) {
                        const c = *++p;
                        switch (c) {
                        case '\n':
                            break;
                        case '\r':
                            if (p[1] == '\n')
                                p++;
                            break;
                        case 0:
                        case 0x1A:
                            if (commentToken) {
                                p = end;
                                t.loc = startLoc;
                                t.value = TOK.comment;
                                return;
                            }
                            if (doDocComment && t.ptr[2] == '/') {
                                getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                                lastDocLine = scanloc.linnum;
                            }
                            p = end;
                            t.loc = loc();
                            t.value = TOK.endOfFile;
                            return;
                        default:
                            if (c & 0x80) {
                                const u = decodeUTF();
                                if (u == PS || u == LS)
                                    break;
                            }
                            continue;
                        }
                        break;
                    }
                    if (commentToken) {
                        version (DMDLIB) {}
                        else {
                            p++;
                            endOfLine();
                        }
                        t.loc = startLoc;
                        t.value = TOK.comment;
                        return;
                    }
                    if (doDocComment && t.ptr[2] == '/') {
                        getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                        lastDocLine = scanloc.linnum;
                    }
                    p++;
                    endOfLine();
                    continue;
                case '+':
                    if (!Ccompile) {
                        int nest;
                        startLoc = loc();
                        p++;
                        nest = 1;
                        while (1) {
                            char c = *p;
                            switch (c) {
                            case '/':
                                p++;
                                if (*p == '+') {
                                    p++;
                                    nest++;
                                }
                                continue;
                            case '+':
                                p++;
                                if (*p == '/') {
                                    p++;
                                    if (--nest == 0)
                                        break;
                                }
                                continue;
                            case '\r':
                                p++;
                                if (*p != '\n')
                                    endOfLine();
                                continue;
                            case '\n':
                                endOfLine();
                                p++;
                                continue;
                            case 0:
                            case 0x1A:
                                error("unterminated /+ +/ comment");
                                p = end;
                                t.loc = loc();
                                t.value = TOK.endOfFile;
                                return;
                            default:
                                if (c & 0x80) {
                                    uint u = decodeUTF();
                                    if (u == PS || u == LS)
                                        endOfLine();
                                }
                                p++;
                                continue;
                            }
                            break;
                        }
                        if (commentToken) {
                            t.loc = startLoc;
                            t.value = TOK.comment;
                            return;
                        }
                        if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr) {
                            // if /++ but not /++/
                            getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                            lastDocLine = scanloc.linnum;
                        }
                        continue;
                    }
                    break;
                default:
                    break;
                }
                t.value = TOK.div;
                return;
            case '.':
                p++;
                if (isdigit(*p)) {
                    /* Note that we don't allow ._1 and ._ as being
                     * valid floating point numbers.
                     */
                    p--;
                    t.value = inreal(t);
                } else if (p[0] == '.') {
                    if (p[1] == '.') {
                        p += 2;
                        t.value = TOK.dotDotDot;
                    } else {
                        p++;
                        t.value = TOK.slice;
                    }
                } else
                    t.value = TOK.dot;
                return;
            case '&':
                p++;
                if (*p == '=') {
                    p++;
                    t.value = TOK.andAssign;
                } else if (*p == '&') {
                    p++;
                    t.value = TOK.andAnd;
                } else
                    t.value = TOK.and;
                return;
            case '|':
                p++;
                if (*p == '=') {
                    p++;
                    t.value = TOK.orAssign;
                } else if (*p == '|') {
                    p++;
                    t.value = TOK.orOr;
                } else
                    t.value = TOK.or;
                return;
            case '-':
                p++;
                if (*p == '=') {
                    p++;
                    t.value = TOK.minAssign;
                } else if (*p == '-') {
                    p++;
                    t.value = TOK.minusMinus;
                } else if (*p == '>') {
                    ++p;
                    t.value = TOK.arrow;
                } else
                    t.value = TOK.min;
                return;
            case '+':
                p++;
                if (*p == '=') {
                    p++;
                    t.value = TOK.addAssign;
                } else if (*p == '+') {
                    p++;
                    t.value = TOK.plusPlus;
                } else
                    t.value = TOK.add;
                return;
            case '<':
                p++;
                if (*p == '=') {
                    p++;
                    t.value = TOK.lessOrEqual; // <=
                } else if (*p == '<') {
                    p++;
                    if (*p == '=') {
                        p++;
                        t.value = TOK.leftShiftAssign; // <<=
                    } else
                        t.value = TOK.leftShift; // <<
                } else if (*p == ':' && Ccompile) {
                    ++p;
                    t.value = TOK.leftBracket; // <:
                } else if (*p == '%' && Ccompile) {
                    ++p;
                    t.value = TOK.leftCurly; // <%
                } else
                    t.value = TOK.lessThan; // <
                return;
            case '>':
                p++;
                if (*p == '=') {
                    p++;
                    t.value = TOK.greaterOrEqual; // >=
                } else if (*p == '>') {
                    p++;
                    if (*p == '=') {
                        p++;
                        t.value = TOK.rightShiftAssign; // >>=
                    } else if (*p == '>') {
                        p++;
                        if (*p == '=') {
                            p++;
                            t.value = TOK.unsignedRightShiftAssign; // >>>=
                        } else
                            t.value = TOK.unsignedRightShift; // >>>
                    } else
                        t.value = TOK.rightShift; // >>
                } else
                    t.value = TOK.greaterThan; // >
                return;
            case '!':
                p++;
                if (*p == '=') {
                    p++;
                    t.value = TOK.notEqual; // !=
                } else
                    t.value = TOK.not; // !
                return;
            case '=':
                p++;
                if (*p == '=') {
                    p++;
                    t.value = TOK.equal; // ==
                } else if (*p == '>') {
                    p++;
                    t.value = TOK.goesTo; // =>
                } else
                    t.value = TOK.assign; // =
                return;
            case '~':
                p++;
                if (*p == '=') {
                    p++;
                    t.value = TOK.concatenateAssign; // ~=
                } else
                    t.value = TOK.tilde; // ~
                return;
            case '^':
                p++;
                if (*p == '^') {
                    p++;
                    if (*p == '=') {
                        p++;
                        t.value = TOK.powAssign; // ^^=
                    } else
                        t.value = TOK.pow; // ^^
                } else if (*p == '=') {
                    p++;
                    t.value = TOK.xorAssign; // ^=
                } else
                    t.value = TOK.xor; // ^
                return;
            case '(':
                p++;
                t.value = TOK.leftParenthesis;
                return;
            case ')':
                p++;
                t.value = TOK.rightParenthesis;
                return;
            case '[':
                p++;
                t.value = TOK.leftBracket;
                return;
            case ']':
                p++;
                t.value = TOK.rightBracket;
                return;
            case '{':
                p++;
                t.value = TOK.leftCurly;
                return;
            case '}':
                p++;
                t.value = TOK.rightCurly;
                return;
            case '?':
                p++;
                t.value = TOK.question;
                return;
            case ',':
                p++;
                t.value = TOK.comma;
                return;
            case ';':
                p++;
                t.value = TOK.semicolon;
                return;
            case ':':
                p++;
                if (*p == ':') {
                    ++p;
                    t.value = TOK.colonColon;
                } else if (*p == '>' && Ccompile) {
                    ++p;
                    t.value = TOK.rightBracket;
                } else
                    t.value = TOK.colon;
                return;
            case '$':
                p++;
                t.value = TOK.dollar;
                return;
            case '@':
                p++;
                t.value = TOK.at;
                return;
            case '*':
                p++;
                if (*p == '=') {
                    p++;
                    t.value = TOK.mulAssign;
                } else
                    t.value = TOK.mul;
                return;
            case '%':
                p++;
                if (*p == '=') {
                    p++;
                    t.value = TOK.modAssign;
                } else if (*p == '>' && Ccompile) {
                    ++p;
                    t.value = TOK.rightCurly;
                } else if (*p == ':' && Ccompile) {
                    goto case '#'; // %: means #
                } else
                    t.value = TOK.mod;
                return;
            case '#':
                {
                    // https://issues.dlang.org/show_bug.cgi?id=22825
                    // Special token sequences are terminated by newlines,
                    // and should not be skipped over.
                    this.tokenizeNewlines = true;
                    p++;
                    if (parseSpecialTokenSequence())
                        continue;
                    t.value = TOK.pound;
                    return;
                }
            default:
                {
                    dchar c = *p;
                    if (c & 0x80) {
                        c = decodeUTF();
                        // Check for start of unicode identifier
                        if (isUniAlpha(c))
                            goto case_ident;
                        if (c == PS || c == LS) {
                            endOfLine();
                            p++;
                            if (tokenizeNewlines) {
                                t.value = TOK.endOfLine;
                                tokenizeNewlines = false;
                                return;
                            }
                            continue;
                        }
                    }
                    if (c < 0x80 && isprint(c))
                        error("character '%c' is not a valid token", c);
                    else
                        error("character 0x%02x is not a valid token", c);
                    p++;
                    continue;
                }
            }
        }
    }

    /***********************
     * Return the token following `ct`, scanning and linking a new one
     * onto `ct.next` if it has not been looked at yet.
     */
    final Token* peek(Token* ct) {
        Token* t;
        if (ct.next)
            t = ct.next;
        else {
            t = allocateToken();
            scan(t);
            ct.next = t;
        }
        return t;
    }

    /*********************************
     * tk is on the opening (.
     * Look ahead and return token that is past the closing ).
     */
    final Token* peekPastParen(Token* tk) {
        //printf("peekPastParen()\n");
        int parens = 1;
        int curlynest = 0;
        while (1) {
            tk = peek(tk);
            //tk.print();
            switch (tk.value) {
            case TOK.leftParenthesis:
                parens++;
                continue;
            case TOK.rightParenthesis:
                --parens;
                if (parens)
                    continue;
                tk = peek(tk);
                break;
            case TOK.leftCurly:
                curlynest++;
                continue;
            case TOK.rightCurly:
                if (--curlynest >= 0)
                    continue;
                break;
            case TOK.semicolon:
                if (curlynest)
                    continue;
                break;
            case TOK.endOfFile:
                break;
            default:
                continue;
            }
            return tk;
        }
    }

    /*******************************************
     * Parse escape sequence.
     */
    private uint escapeSequence() {
        return Lexer.escapeSequence(token.loc, p, Ccompile);
    }

    /********
     * Parse the given string literal escape sequence into a single character.
     * D https://dlang.org/spec/lex.html#escape_sequences
     * C11 6.4.4.4
     * Params:
     *  loc = location to use for error messages
     *  sequence = pointer to string with escape sequence to parse.
Updated to
     *      point past the end of the escape sequence
     *  Ccompile = true for compile C11 escape sequences
     * Returns:
     *  the escape sequence as a single character
     */
    private static dchar escapeSequence(const ref Loc loc, ref const(char)* sequence, bool Ccompile) {
        const(char)* p = sequence; // cache sequence reference on stack
        scope(exit) sequence = p;  // publish the final position on every exit path

        uint c = *p;
        int ndigits;
        switch (c) {
        case '\'':
        case '"':
        case '?':
        case '\\':
        Lconsume:
            p++;
            break;
        case 'a':
            c = 7;
            goto Lconsume;
        case 'b':
            c = 8;
            goto Lconsume;
        case 'f':
            c = 12;
            goto Lconsume;
        case 'n':
            c = 10;
            goto Lconsume;
        case 'r':
            c = 13;
            goto Lconsume;
        case 't':
            c = 9;
            goto Lconsume;
        case 'v':
            c = 11;
            goto Lconsume;
        case 'u':
            ndigits = 4;
            goto Lhex;
        case 'U':
            ndigits = 8;
            goto Lhex;
        case 'x':
            ndigits = 2;
        Lhex:
            p++;
            c = *p;
            if (ishex(cast(char)c)) {
                uint v = 0;
                int n = 0;
                if (Ccompile && ndigits == 2) {
                    /* C11 6.4.4.4-7 one to infinity hex digits
                     */
                    do {
                        if (isdigit(cast(char)c))
                            c -= '0';
                        else if (islower(c))
                            c -= 'a' - 10;
                        else
                            c -= 'A' - 10;
                        v = v * 16 + c;
                        c = *++p;
                    } while (ishex(cast(char)c));
                } else {
                    // exactly `ndigits` hex digits are expected
                    while (1) {
                        if (isdigit(cast(char)c))
                            c -= '0';
                        else if (islower(c))
                            c -= 'a' - 10;
                        else
                            c -= 'A' - 10;
                        v = v * 16 + c;
                        c = *++p;
                        if (++n == ndigits)
                            break;
                        if (!ishex(cast(char)c)) {
                            .error(loc, "escape hex sequence has %d hex digits instead of %d", n, ndigits);
                            break;
                        }
                    }
                    if (ndigits != 2 && !utf_isValidDchar(v)) {
                        .error(loc, "invalid UTF character \\U%08x", v);
                        v = '?'; // recover with valid UTF character
                    }
                }
                c = v;
            } else {
                .error(loc, "undefined escape hex sequence \\%c%c", sequence[0], c);
                // Skip the offending character, but never step over the 0 / 0x1A
                // terminator: the caller keeps reading from `p` and relies on
                // seeing the end-of-file character (same policy as `case 0` below).
                if (c != 0 && c != 0x1A)
                    p++;
            }
            break;
        case '&':
            if (Ccompile)
                goto default;

            // named character entity
            for (const idstart = ++p; 1; p++) {
                switch (*p) {
                case ';':
                    c = HtmlNamedEntity(idstart, p - idstart);
                    if (c == ~0) {
                        .error(loc, "unnamed character entity &%.*s;", cast(int)(p - idstart), idstart);
                        c = '?';
                    }
                    p++;
                    break;
                default:
                    if (isalpha(*p) || (p != idstart && isdigit(*p)))
                        continue;
                    .error(loc, "unterminated named entity &%.*s;", cast(int)(p - idstart + 1), idstart);
                    c = '?';
                    break;
                }
                break;
            }
            break;
        case 0:
        case 0x1A:
            // end of file
            c = '\\';
            break;
        default:
            if (isoctal(cast(char)c)) {
                // up to three octal digits
                uint v = 0;
                int n = 0;
                do {
                    v = v * 8 + (c - '0');
                    c = *++p;
                } while (++n < 3 && isoctal(cast(char)c));
                c = v;
                if (c > 0xFF)
                    .error(loc, "escape octal sequence \\%03o is larger than \\377", c);
            } else {
                .error(loc, "undefined escape sequence \\%c", c);
                p++;
            }
            break;
        }
        return c;
    }

    /**
    Lex a wysiwyg string. `p` must be pointing to the first character before the
    contents of the string literal. The character pointed to by `p` will be used as
    the terminating character (i.e. backtick or double-quote).
    Params:
        result = pointer to the token that accepts the result
    */
    private void wysiwygStringConstant(Token* result) {
        result.value = TOK.string_;
        Loc start = loc();
        auto terminator = p[0];
        p++;
        stringbuffer.setsize(0);
        while (1) {
            dchar c = p[0];
            p++;
            switch (c) {
            case '\n':
                endOfLine();
                break;
            case '\r':
                if (p[0] == '\n')
                    continue; // ignore
                c = '\n'; // treat EndOfLine as \n character
                endOfLine();
                break;
            case 0:
            case 0x1A:
                error("unterminated string constant starting at %s", start.toChars());
                result.setString();
                // rewind `p` so it points to the EOF character
                p--;
                return;
            default:
                if (c == terminator) {
                    result.setString(stringbuffer);
                    stringPostfix(result);
                    return;
                } else if (c & 0x80) {
                    p--;
                    const u = decodeUTF();
                    p++;
                    if (u == PS || u == LS)
                        endOfLine();
                    stringbuffer.writeUTF8(u);
                    continue;
                }
                break;
            }
            stringbuffer.writeByte(c);
        }
    }

    /**
    Lex a delimited string. Some examples of delimited strings are:
    ---
    q"(foo(xxx))"      // "foo(xxx)"
    q"[foo$(LPAREN)]"  // "foo$(LPAREN)"
    q"/foo]/"          // "foo]"
    q"HERE
    foo
    HERE"              // "foo\n"
    ---
    It is assumed that `p` points to the opening double-quote '"'.
Params:
        result = pointer to the token that accepts the result
    */
    private void delimitedStringConstant(Token* result) {
        result.value = TOK.string_;
        Loc start = loc();
        dchar delimleft = 0;
        dchar delimright = 0;
        uint nest = 1;
        uint nestcount = ~0; // dead assignment, needed to suppress warning
        Identifier hereid = null;
        uint blankrol = 0;
        uint startline = 0;
        p++;
        stringbuffer.setsize(0);
        while (1) {
            dchar c = *p++;
            //printf("c = '%c'\n", c);
            switch (c) {
            case '\n':
            Lnextline:
                endOfLine();
                startline = 1;
                if (blankrol) {
                    // the rest of the heredoc opening line has been consumed
                    blankrol = 0;
                    continue;
                }
                if (hereid) {
                    stringbuffer.writeUTF8(c);
                    continue;
                }
                break;
            case '\r':
                if (*p == '\n')
                    continue; // ignore
                c = '\n'; // treat EndOfLine as \n character
                goto Lnextline;
            case 0:
            case 0x1A:
                error("unterminated delimited string constant starting at %s", start.toChars());
                result.setString();
                // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
                p--;
                return;
            default:
                if (c & 0x80) {
                    p--;
                    c = decodeUTF();
                    p++;
                    if (c == PS || c == LS)
                        goto Lnextline;
                }
                break;
            }

            if (delimleft == 0) {
                // First character after the quote selects the delimiter kind.
                delimleft = c;
                nest = 1;
                nestcount = 1;
                switch (c) {
                case '(':
                    delimright = ')';
                    break;
                case '{':
                    delimright = '}';
                    break;
                case '[':
                    delimright = ']';
                    break;
                case '<':
                    delimright = '>';
                    break;
                default:
                    if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) {
                        // Start of identifier; must be a heredoc
                        Token opener;
                        p--;
                        scan(&opener); // read in heredoc identifier
                        if (opener.value != TOK.identifier) {
                            error("identifier expected for heredoc, not %s", opener.toChars());
                            delimright = c;
                        } else {
                            hereid = opener.ident;
                            //printf("hereid = '%s'\n", hereid.toChars());
                            blankrol = 1;
                        }
                        nest = 0;
                    } else {
                        delimright = c;
                        nest = 0;
                        if (isspace(c))
                            error("delimiter cannot be whitespace");
                    }
                    break;
                }
                continue;
            }

            if (blankrol) {
                error("heredoc rest of line should be blank");
                blankrol = 0;
                continue;
            }
            if (nest == 1) {
                if (c == delimleft)
                    nestcount++;
                else if (c == delimright) {
                    if (--nestcount == 0)
                        goto Ldone;
                }
            } else if (c == delimright)
                goto Ldone;
            if (startline && (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) && hereid) {
                Token closer;
                auto resume = p;
                p--;
                scan(&closer); // read in possible heredoc identifier
                //printf("endid = '%s'\n", closer.ident.toChars());
                if (closer.value == TOK.identifier && closer.ident is hereid) {
                    /* should check that rest of line is blank
                     */
                    goto Ldone;
                }
                p = resume; // not the terminator: continue after `c`
            }
            stringbuffer.writeUTF8(c);
            startline = 0;
        }
    Ldone:
        if (*p == '"')
            p++;
        else if (hereid)
            error("delimited string must end in `%s\"`", hereid.toChars());
        else if (isspace(delimright))
            error("delimited string must end in `\"`");
        else
            error("delimited string must end in `%c\"`", delimright);
        result.setString(stringbuffer);
        stringPostfix(result);
    }

    /**
    Lex a token string. Some examples of token strings are:
    ---
    q{ foo(xxx) }      // " foo(xxx) "
    q{foo$(LPAREN)}    // "foo$(LPAREN)"
    q{{foo}"}"}        // "{foo}"}""
    ---
    It is assumed that `p` points to the opening curly-brace.
    Params:
        result = pointer to the token that accepts the result
    */
    private void tokenStringConstant(Token* result) {
        result.value = TOK.string_;

        uint depth = 1;
        const start = loc();
        const pstart = ++p;
        inTokenStringConstant++;
        scope(exit) inTokenStringConstant--;
        for (;;) {
            Token inner;
            scan(&inner);
            if (inner.value == TOK.leftCurly) {
                depth++;
            } else if (inner.value == TOK.rightCurly) {
                if (--depth == 0) {
                    // contents run from just after `{` up to the matching `}`
                    result.setString(pstart, p - 1 - pstart);
                    stringPostfix(result);
                    return;
                }
            } else if (inner.value == TOK.endOfFile) {
                error("unterminated token string constant starting at %s", start.toChars());
                result.setString();
                return;
            }
        }
    }

    /**
    Scan a quoted string while building the processed string value by
    handling escape sequences. The result is returned in the given `t` token.
    This function assumes that `p` currently points to the opening quote
    of the string.
Params:
        t = the token to set the resulting string to
     * References:
     *   D https://dlang.org/spec/lex.html#double_quoted_strings
     *   ImportC C11 6.4.5
     */
    private void escapeStringConstant(Token* t) {
        t.value = TOK.string_;

        const start = loc();
        const tc = *p++; // opening quote
        stringbuffer.setsize(0);
        while (1) {
            dchar c = *p++;
            switch (c) {
            case '\\':
                switch (*p) {
                case '&':
                    if (Ccompile)
                        goto default;
                    goto case;

                case 'u':
                case 'U':
                    // may yield a code point above 0x7F: store as UTF-8
                    c = escapeSequence();
                    stringbuffer.writeUTF8(c);
                    continue;
                default:
                    c = escapeSequence();
                    break;
                }
                break;
            case '\n':
                endOfLine();
                if (Ccompile)
                    goto Lunterminated;
                break;
            case '\r':
                if (*p == '\n')
                    continue; // ignore
                c = '\n'; // treat EndOfLine as \n character
                endOfLine();
                if (Ccompile)
                    goto Lunterminated;
                break;
            case '\'':
            case '"':
                if (c != tc)
                    goto default;
                t.setString(stringbuffer);
                if (!Ccompile)
                    stringPostfix(t);
                return;
            case 0:
            case 0x1A:
                // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
                p--;
            Lunterminated:
                error("unterminated string constant starting at %s", start.toChars());
                t.setString();
                return;
            default:
                if (c & 0x80) {
                    p--;
                    c = decodeUTF();
                    if (c == LS || c == PS) {
                        c = '\n';
                        endOfLine();
                        if (Ccompile)
                            goto Lunterminated;
                    }
                    p++;
                    stringbuffer.writeUTF8(c);
                    continue;
                }
                break;
            }
            stringbuffer.writeByte(c);
        }
    }

    /**************************************
     * Reference:
     *    https://dlang.org/spec/lex.html#characterliteral
     */
    private TOK charConstant(Token* t) {
        TOK tk = TOK.charLiteral;
        //printf("Lexer::charConstant\n");
        p++;
        dchar c = *p++;
        switch (c) {
        case '\\':
            switch (*p) {
            case 'u':
                t.unsvalue = escapeSequence();
                tk = TOK.wcharLiteral;
                break;
            case 'U':
            case '&':
                t.unsvalue = escapeSequence();
                tk = TOK.dcharLiteral;
                break;
            default:
                t.unsvalue = escapeSequence();
                break;
            }
            break;
        case '\n':
        L1:
            endOfLine();
            goto case;
        case '\r':
            goto case '\'';
        case 0:
        case 0x1A:
            // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
            p--;
            goto case;
        case '\'':
            error("unterminated character constant");
            t.unsvalue = '?';
            return tk;
        default:
            if (c & 0x80) {
                p--;
                c = decodeUTF();
                p++;
                if (c == LS || c == PS)
                    goto L1;
                if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
                    tk = TOK.wcharLiteral;
                else
                    tk = TOK.dcharLiteral;
            }
            t.unsvalue = c;
            break;
        }
        if (*p != '\'') {
            // skip ahead to a plausible end of the literal for error recovery
            while (*p != '\'' && *p != 0x1A && *p != 0 && *p != '\n' &&
                    *p != '\r' && *p != ';' && *p != ')' && *p != ']' && *p != '}') {
                if (*p & 0x80) {
                    const s = p;
                    c = decodeUTF();
                    if (c == LS || c == PS) {
                        p = s;
                        break;
                    }
                }
                p++;
            }

            if (*p == '\'') {
                error("character constant has multiple characters");
                p++;
            } else
                error("unterminated character constant");
            t.unsvalue = '?';
            return tk;
        }
        p++;
        return tk;
    }

    /***************************************
     * Lex C character constant.
     * Parser is on the opening quote.
     * The contents are first lexed as a string by `escapeStringConstant`,
     * then folded into a single integer value according to `prefix`.
     * Params:
     *  t = token to fill in
     *  prefix = one of `u`, `U` or 0.
     * Reference:
     *  C11 6.4.4.4
     */
    private void clexerCharConstant(ref Token t, char prefix) {
        escapeStringConstant(&t);
        const(char)[] str = t.ustring[0 .. t.len];
        const n = str.length;
        const loc = t.loc;
        if (n == 0) {
            error(loc, "empty character constant");
            t.value = TOK.semicolon;
            return;
        }

        uint u;
        switch (prefix) {
        case 0:
            if (n == 1) // fast case
            {
                u = str[0];
            } else if (n > 4)
                error(loc, "max number of chars in character literal is 4, had %d", cast(int)n);
            else {
                // pack the chars so that the first one ends up most significant
                foreach (i, c; str)
                    (cast(char*)&u)[n - 1 - i] = c;
            }
            break;

        case 'u':
            dchar d1;
            size_t idx;
            auto msg = utf_decodeChar(str, idx, d1);
            dchar d2 = 0;
            if (idx < n && !msg)
                msg = utf_decodeChar(str, idx, d2);
            if (msg)
                error(loc, "%s", msg);
            else if (idx < n)
                // cast: the count is a size_t expression but is printed with %d
                error(loc, "max number of chars in 16 bit character literal is 2, had %d",
                    cast(int)((n + 1) >> 1));
            else if (d1 > 0xFFFF) // 0x1_0000 itself already needs 17 bits
                error(loc, "%d does not fit in 16 bits", d1);
            else if (d2 > 0xFFFF)
                error(loc, "%d does not fit in 16 bits", d2);
            u = d1;
            if (d2)
                u = (d1 << 16) | d2;
            break;

        case 'U':
            dchar d;
            size_t idx;
            auto msg = utf_decodeChar(str, idx, d);
            if (msg)
                error(loc, "%s", msg);
            else if (idx < n)
                // cast: the count is a size_t expression but is printed with %d
                error(loc, "max number of chars in 32 bit character literal is 1, had %d",
                    cast(int)((n + 3) >> 2));
            u = d;
            break;

        default:
            assert(0);
        }
        t.value = n == 1 ? TOK.charLiteral : TOK.int32Literal;
        t.unsvalue = u;
    }

    /***************************************
     * Get postfix of string literal.
     */
    private void stringPostfix(Token* t) pure @nogc {
        switch (*p) {
        case 'c':
        case 'w':
        case 'd':
            t.postfix = *p;
            p++;
            break;
        default:
            t.postfix = 0;
            break;
        }
    }

    /**************************************
     * Read in a number.
     * If it's an integer, store it in tok.TKutok.Vlong.
     *      integers can be decimal, octal or hex
     *      Handle the suffixes U, UL, LU, L, etc.
     * If it's double, store it in tok.TKutok.Vdouble.
     * Returns:
     *      TKnum
     *      TKdouble,...
*/
    private TOK number(Token* t) {
        int base = 10;
        const start = p;
        uinteger_t n = 0; // unsigned >=64 bit integer type
        int digit;
        bool err = false;
        bool overflow = false;
        bool anyBinaryDigitsNoSingleUS = false;
        bool anyHexDigitsNoSingleUS = false;
        char errorDigit = 0;
        dchar c = *p;
        if (c == '0') {
            // A leading zero selects the radix, or starts a real literal.
            ++p;
            c = *p;
            switch (c) {
            case '0': .. case '7':
                base = 8;
                break;

            case '8':
            case '9':
                errorDigit = cast(char) c;
                base = 8;
                break;
            case 'x':
            case 'X':
                ++p;
                base = 16;
                break;
            case 'b':
            case 'B':
                if (Ccompile)
                    error("binary constants not allowed");
                ++p;
                base = 2;
                break;
            case '.':
                if (p[1] == '.')
                    goto Ldone; // if ".."
                if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80) {
                    if (Ccompile && (p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L'))
                        goto Lreal; // if `0.f` or `0.L`
                    goto Ldone; // if ".identifier" or ".unicode"
                }
                goto Lreal; // '.' is part of current token
            case 'i':
            case 'f':
            case 'F':
                goto Lreal;
            case '_':
                if (Ccompile)
                    error("embedded `_` not allowed");
                ++p;
                base = 8;
                break;
            case 'L':
                if (p[1] == 'i')
                    goto Lreal;
                break;
            default:
                break;
            }
        }
        while (1) {
            c = *p;
            switch (c) {
            case '0': .. case '9':
                ++p;
                digit = c - '0';
                break;
            case 'a': .. case 'f':
            case 'A': .. case 'F':
                ++p;
                if (base != 16) {
                    if (c == 'e' || c == 'E' || c == 'f' || c == 'F')
                        goto Lreal;
                }
                digit = c + 10 - (c >= 'a' ? 'a' : 'A');
                break;
            case 'L':
                if (p[1] == 'i')
                    goto Lreal;
                goto Ldone;
            case '.':
                if (p[1] == '.')
                    goto Ldone; // if ".."
                if (base <= 10 && n > 0 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)) {
                    if (Ccompile && base == 10 &&
                        (p[1] == 'e' || p[1] == 'E' || p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L'))
                        goto Lreal; // if `1.e6` or `1.f` or `1.L`
                    goto Ldone; // if ".identifier" or ".unicode"
                }
                if (base == 16 && (!ishex(p[1]) || p[1] == '_' || p[1] & 0x80))
                    goto Ldone; // if ".identifier" or ".unicode"
                if (base == 2)
                    goto Ldone; // if ".identifier" or ".unicode"
                goto Lreal; // otherwise as part of a floating point literal
            case 'p':
            case 'P':
            case 'i':
            Lreal:
                // restart from the beginning and lex as a floating point literal
                p = start;
                return inreal(t);
            case '_':
                if (Ccompile)
                    goto default;
                ++p;
                continue;
            default:
                goto Ldone;
            }
            // got a digit here, set any necessary flags, check for errors
            anyHexDigitsNoSingleUS = true;
            anyBinaryDigitsNoSingleUS = true;
            if (!errorDigit && digit >= base) {
                errorDigit = cast(char) c;
            }
            // Avoid expensive overflow check if we aren't at risk of overflow
            if (n <= 0x0FFF_FFFF_FFFF_FFFFUL)
                n = n * base + digit;
            else {
                import core.checkedint : mulu, addu;

                n = mulu(n, base, overflow);
                n = addu(n, digit, overflow);
            }
        }
    Ldone:
        if (errorDigit) {
            error("%s digit expected, not `%c`", base == 2 ? "binary".ptr :
                                                 base == 8 ? "octal".ptr :
                                                 "decimal".ptr, errorDigit);
            err = true;
        }
        if (overflow && !err) {
            error("integer overflow");
            err = true;
        }
        if ((base == 2 && !anyBinaryDigitsNoSingleUS) ||
            (base == 16 && !anyHexDigitsNoSingleUS))
            error("`%.*s` isn't a valid integer literal, use `%.*s0` instead", cast(int)(p - start), start, 2, start);

        t.unsvalue = n;

        if (Ccompile)
            return cnumber(base, n);

        enum FLAGS : int {
            none = 0,
            decimal = 1, // decimal
            unsigned = 2, // u or U suffix
            long_ = 4, // L suffix
        }

        FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.none;
        // Parse trailing 'u', 'U', 'l' or 'L' in any combination
        const psuffix = p;
    Lsuffix:
        while (1) {
            FLAGS f;
            switch (*p) {
            case 'U':
            case 'u':
                f = FLAGS.unsigned;
                break;
            case 'l':
                f = FLAGS.long_;
                error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
                break;
            case 'L':
                f = FLAGS.long_;
                break;
            default:
                break Lsuffix;
            }
            p++;
            // the same suffix kind given twice
            if ((flags & f) && !err) {
                error("unrecognized token");
                err = true;
            }
            flags = cast(FLAGS)(flags | f);
        }
        if (base == 8 && n >= 8) {
            if (err)
                // can't translate invalid octal value, just show a generic message
                error("octal literals larger than 7 are no longer supported");
            else
                error("octal literals `0%llo%.*s` are no longer supported, use `std.conv.octal!\"%llo%.*s\"` instead",
                    n, cast(int)(p - psuffix), psuffix, n, cast(int)(p - psuffix), psuffix);
        }
        TOK result;
        switch (flags) {
        case FLAGS.none:
            /* Octal or Hexadecimal constant.
             * First that fits: int, uint, long, ulong
             */
            if (n & 0x8000000000000000L)
                result = TOK.uns64Literal;
            else if (n & 0xFFFFFFFF00000000L)
                result = TOK.int64Literal;
            else if (n & 0x80000000)
                result = TOK.uns32Literal;
            else
                result = TOK.int32Literal;
            break;
        case FLAGS.decimal:
            /* First that fits: int, long, long long
             */
            if (n & 0x8000000000000000L) {
                result = TOK.uns64Literal;
            } else if (n & 0xFFFFFFFF80000000L)
                result = TOK.int64Literal;
            else
                result = TOK.int32Literal;
            break;
        case FLAGS.unsigned:
        case FLAGS.decimal | FLAGS.unsigned:
            /* First that fits: uint, ulong
             */
            if (n & 0xFFFFFFFF00000000L)
                result = TOK.uns64Literal;
            else
                result = TOK.uns32Literal;
            break;
        case FLAGS.decimal | FLAGS.long_:
            if (n & 0x8000000000000000L) {
                if (!err) {
                    error("signed integer overflow");
                    err = true;
                }
                result = TOK.uns64Literal;
            } else
                result = TOK.int64Literal;
            break;
        case FLAGS.long_:
            if (n & 0x8000000000000000L)
                result = TOK.uns64Literal;
            else
                result = TOK.int64Literal;
            break;
        case FLAGS.unsigned | FLAGS.long_:
        case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_:
            result = TOK.uns64Literal;
            break;
        default:
            debug {
                printf("%x\n", flags);
            }
            assert(0);
        }
        return result;
    }

    /**************************************
     * Lex C integer-suffix
     * Params:
     *  base = number base
     *  n = raw integer value
     * Returns:
     *  token value
     */
    private TOK cnumber(int base, uinteger_t n) {
        /*
C11 6.4.4.1 * Parse trailing suffixes: * u or U * l or L * ll or LL */ enum FLAGS : uint { octalhex = 1, // octal or hexadecimal decimal = 2, // decimal unsigned = 4, // u or U suffix long_ = 8, // l or L suffix llong = 0x10 // ll or LL } FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.octalhex; bool err; Lsuffixes: while (1) { FLAGS f; const cs = *p; switch (cs) { case 'U': case 'u': f = FLAGS.unsigned; break; case 'l': case 'L': f = FLAGS.long_; if (cs == p[1]) { f = FLAGS.long_ | FLAGS.llong; ++p; } break; default: break Lsuffixes; } ++p; if ((flags & f) && !err) { error("duplicate integer suffixes"); err = true; } flags = cast(FLAGS)(flags | f); } TOK result = TOK.int32Literal; // default switch (flags) { /* Since D doesn't have a variable sized `long` or `unsigned long` type, * this code deviates from C by picking D int, uint, long, or ulong instead */ case FLAGS.octalhex: /* Octal or Hexadecimal constant. * First that fits: int, unsigned, long, unsigned long, * long long, unsigned long long */ if (n & 0x8000000000000000L) result = TOK.uns64Literal; // unsigned long else if (n & 0xFFFFFFFF00000000L) result = TOK.int64Literal; // long else if (n & 0x80000000) result = TOK.uns32Literal; else result = TOK.int32Literal; break; case FLAGS.decimal: /* First that fits: int, long, long long */ if (n & 0x8000000000000000L) result = TOK.uns64Literal; // unsigned long else if (n & 0xFFFFFFFF80000000L) result = TOK.int64Literal; // long else result = TOK.int32Literal; break; case FLAGS.octalhex | FLAGS.unsigned: case FLAGS.decimal | FLAGS.unsigned: /* First that fits: unsigned, unsigned long, unsigned long long */ if (n & 0xFFFFFFFF00000000L) result = TOK.uns64Literal; // unsigned long else result = TOK.uns32Literal; break; case FLAGS.decimal | FLAGS.long_: /* First that fits: long, long long */ if (longsize == 4 || long_longsize == 4) { if (n & 0xFFFFFFFF_80000000L) result = TOK.int64Literal; else result = TOK.int32Literal; // long } else { result = TOK.int64Literal; 
// long } break; case FLAGS.octalhex | FLAGS.long_: /* First that fits: long, unsigned long, long long, * unsigned long long */ if (longsize == 4 || long_longsize == 4) { if (n & 0x8000000000000000L) result = TOK.uns64Literal; else if (n & 0xFFFFFFFF00000000L) result = TOK.int64Literal; else if (n & 0x80000000) result = TOK.uns32Literal; // unsigned long else result = TOK.int32Literal; // long } else { if (n & 0x80000000_00000000L) result = TOK.uns64Literal; // unsigned long else result = TOK.int64Literal; // long } break; case FLAGS.octalhex | FLAGS.unsigned | FLAGS.long_: case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_: /* First that fits: unsigned long, unsigned long long */ if (longsize == 4 || long_longsize == 4) { if (n & 0xFFFFFFFF00000000L) result = TOK.uns64Literal; else result = TOK.uns32Literal; // unsigned long } else { result = TOK.uns64Literal; // unsigned long } break; case FLAGS.octalhex | FLAGS.long_ | FLAGS.llong: /* First that fits: long long, unsigned long long */ if (n & 0x8000000000000000L) result = TOK.uns64Literal; else result = TOK.int64Literal; break; case FLAGS.decimal | FLAGS.long_ | FLAGS.llong: /* long long */ result = TOK.int64Literal; break; case FLAGS.octalhex | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong: case FLAGS.decimal | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong: result = TOK.uns64Literal; break; default: debug printf("%x\n",flags); assert(0); } return result; } /************************************** * Read in characters, converting them to real. * Bugs: * Exponent overflow not detected. * Too much requested precision is not detected. */ private TOK inreal(Token* t) { //printf("Lexer::inreal()\n"); debug { assert(*p == '.' || isdigit(*p)); } bool isWellformedString = true; stringbuffer.setsize(0); auto pstart = p; bool hex = false; dchar c = *p++; // Leading '0x' if (c == '0') { c = *p++; if (c == 'x' || c == 'X') { hex = true; c = *p++; } } // Digits to left of '.' 
while (1) { if (c == '.') { c = *p++; break; } if (isdigit(c) || (hex && isxdigit(c)) || c == '_') { c = *p++; continue; } break; } // Digits to right of '.' while (1) { if (isdigit(c) || (hex && isxdigit(c)) || c == '_') { c = *p++; continue; } break; } if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P'))) { c = *p++; if (c == '-' || c == '+') { c = *p++; } bool anyexp = false; while (1) { if (isdigit(c)) { anyexp = true; c = *p++; continue; } if (c == '_') { if (Ccompile) error("embedded `_` in numeric literals not allowed"); c = *p++; continue; } if (!anyexp) { error("missing exponent"); isWellformedString = false; } break; } } else if (hex) { error("exponent required for hex float"); isWellformedString = false; } --p; while (pstart < p) { if (*pstart != '_') stringbuffer.writeByte(*pstart); ++pstart; } stringbuffer.writeByte(0); auto sbufptr = cast(const(char)*)stringbuffer[].ptr; TOK result; bool isOutOfRange = false; t.floatvalue = (isWellformedString ? CTFloat.parse(sbufptr, &isOutOfRange) : CTFloat.zero); switch (*p) { case 'F': case 'f': if (isWellformedString && !isOutOfRange) isOutOfRange = Port.isFloat32LiteralOutOfRange(sbufptr); result = TOK.float32Literal; p++; break; default: if (isWellformedString && !isOutOfRange) isOutOfRange = Port.isFloat64LiteralOutOfRange(sbufptr); result = TOK.float64Literal; break; case 'l': if (!Ccompile) error("use 'L' suffix instead of 'l'"); goto case 'L'; case 'L': ++p; if (Ccompile && long_doublesize == 8) goto default; result = TOK.float80Literal; break; } if ((*p == 'i' || *p == 'I') && !Ccompile) { if (*p == 'I') error("use 'i' suffix instead of 'I'"); p++; switch (result) { case TOK.float32Literal: result = TOK.imaginary32Literal; break; case TOK.float64Literal: result = TOK.imaginary64Literal; break; case TOK.float80Literal: result = TOK.imaginary80Literal; break; default: break; } } const isLong = (result == TOK.float80Literal || result == TOK.imaginary80Literal); if (isOutOfRange && !isLong && (!Ccompile 
|| hex)) { /* C11 6.4.4.2 doesn't actually care if it is not representable if it is not hex */ const char* suffix = (result == TOK.float32Literal || result == TOK.imaginary32Literal) ? "f" : ""; error(scanloc, "number `%s%s` is not representable", sbufptr, suffix); } debug { switch (result) { case TOK.float32Literal: case TOK.float64Literal: case TOK.float80Literal: case TOK.imaginary32Literal: case TOK.imaginary64Literal: case TOK.imaginary80Literal: break; default: assert(0); } } return result; } final Loc loc() pure @nogc { scanloc.charnum = cast(uint)(1 + p - line); version (LocOffset) scanloc.fileOffset = cast(uint)(p - base); return scanloc; } final void error(const(char)* format, ...) { va_list args; va_start(args, format); .verror(token.loc, format, args); va_end(args); } final void error(const ref Loc loc, const(char)* format, ...) { va_list args; va_start(args, format); .verror(loc, format, args); va_end(args); } final void deprecation(const(char)* format, ...) { va_list args; va_start(args, format); .vdeprecation(token.loc, format, args); va_end(args); } /*************************************** * Parse special token sequence: * Returns: * true if the special token sequence was handled * References: * https://dlang.org/spec/lex.html#special-token-sequence */ bool parseSpecialTokenSequence() { Token n; scan(&n); if (n.value == TOK.identifier) { if (n.ident == Id.line) { poundLine(n, false); return true; } else { const locx = loc(); warning(locx, "C preprocessor directive `#%s` is not supported", n.ident.toChars()); } } else if (n.value == TOK.if_) { error("C preprocessor directive `#if` is not supported, use `version` or `static if`"); } return false; } /********************************************* * Parse line/file preprocessor directive: * #line linnum [filespec] * Allow __LINE__ for linnum, and __FILE__ for filespec. 
* Accept linemarker format:
     *    # linnum [filespec] {flags}
     * There can be zero or more flags, which are one of the digits 1..4, and
     * must be in ascending order. The flags are ignored.
     * Params:
     *  tok = token we're on, which is linnum of linemarker
     *  linemarker = true if line marker format and lexer is on linnum
     * References:
     *  linemarker https://gcc.gnu.org/onlinedocs/gcc-11.1.0/cpp/Preprocessor-Output.html
     */
    final void poundLine(ref Token tok, bool linemarker)
    {
        auto pendingLinnum = this.scanloc.linnum;   // kept as is for `#line __LINE__`
        const(char)* pendingFile = null;            // set once a filespec was seen
        bool sawFlags;                              // set once a linemarker flag was seen

        // For `#line` the line number is the next token; for a linemarker
        // the caller already scanned it into `tok`.
        if (!linemarker)
            scan(&tok);

        switch (tok.value)
        {
            case TOK.int32Literal:
            case TOK.int64Literal:
            {
                const lin = cast(int)(tok.unsvalue);
                if (lin != tok.unsvalue)
                {
                    error(tok.loc, "line number `%lld` out of range", cast(ulong)tok.unsvalue);
                    skipToNextLine();
                    return;
                }
                pendingLinnum = lin;
                break;
            }

            case TOK.line:  // #line __LINE__
                break;

            default:
                error(tok.loc, "positive integer argument expected following `#line`");
                if (tok.value != TOK.endOfLine)
                    skipToNextLine();
                return;
        }

        // Consume the rest of the directive. Each accepted token loops again;
        // the end of the line returns; anything else drops out to the error
        // reporting below.
        for (;;)
        {
            scan(&tok);
            const kind = tok.value;

            if (kind == TOK.endOfFile || kind == TOK.endOfLine)
            {
                // Directives inside token strings are parsed but not applied
                if (!inTokenStringConstant)
                {
                    this.scanloc.linnum = pendingLinnum;
                    if (pendingFile)
                        this.scanloc.filename = pendingFile;
                }
                return;
            }

            // __FILE__: only as the first thing after the line number
            if (kind == TOK.file && !pendingFile && !sawFlags)
            {
                pendingFile = mem.xstrdup(scanloc.filename);
                continue;
            }

            // "filespec": plain double-quoted string without postfix
            if (kind == TOK.string_ && !pendingFile && !sawFlags &&
                tok.ptr[0] == '"' && tok.postfix == 0)
            {
                pendingFile = tok.ustring;
                continue;
            }

            // linemarker flags 1..4, only after a filespec
            if (kind == TOK.int32Literal && pendingFile &&
                linemarker && tok.unsvalue >= 1 && tok.unsvalue <= 4)
            {
                sawFlags = true; // linemarker flags seen
                continue;
            }

            break;
        }

        // Malformed tail of the directive
        if (pendingFile is null)
            error(tok.loc, "invalid filename for `#line` directive");
        else if (linemarker)
            error(tok.loc, "invalid flag for line marker directive");
        else if (!Ccompile)
            error(tok.loc, "found `%s` when expecting new line following `#line` directive", tok.toChars());
        if (tok.value != TOK.endOfLine)
            skipToNextLine();
    }
/***************************************
     * Advance `p` past the remainder of the current line.
     * Recognised line ends are `\n`, `\r`, `\r\n` and the Unicode line and
     * paragraph separators. At end of input (0 or 0x1A) `p` is left on the
     * terminator and nothing else happens.
     */
    final void skipToNextLine()
    {
        for (;;)
        {
            const ch = *p;

            if (ch == 0 || ch == 0x1A)
                return; // do not advance p

            if (ch == '\n')
            {
                ++p;
                break;
            }

            if (ch == '\r')
            {
                ++p;
                if (*p == '\n')
                    ++p;
                break;
            }

            // decodeUTF() leaves p on the last byte of the sequence, so the
            // increment below is needed whether or not it was a line end
            bool unicodeLineEnd = false;
            if (ch & 0x80)
            {
                const u = decodeUTF();
                unicodeLineEnd = (u == PS || u == LS);
            }
            ++p;
            if (unicodeLineEnd)
                break;
        }

        endOfLine();
        tokenizeNewlines = false;
    }

    /********************************************
     * Decode the UTF-8 sequence starting at `p`.
     * An invalid sequence is reported through error().
     * Returns:
     *  the decoded character; `p` is advanced to the last code unit
     *  consumed by the decoder
     */
    private uint decodeUTF()
    {
        const seq = p;
        assert(*seq & 0x80);

        // Offer the decoder at most 4 code units, fewer if a 0 comes first
        size_t avail = 1;
        while (avail < 4 && seq[avail])
            ++avail;

        size_t consumed = 0;
        dchar decoded;
        const errmsg = utf_decodeChar(seq[0 .. avail], consumed, decoded);
        p += consumed - 1;
        if (errmsg)
        {
            error("%.*s", cast(int)errmsg.length, errmsg.ptr);
        }
        return decoded;
    }

    /***************************************************
     * Parse doc comment embedded between t.ptr and p.
     * Remove trailing blanks and tabs from lines.
     * Replace all newlines with \n.
     * Remove leading comment character from each line.
     * Decide if it's a lineComment or a blockComment.
     * Append to previous one for this token.
     *
     * If newParagraph is true, an extra newline will be
     * added between adjoining doc comments.
*/
    private void getDocComment(Token* t, uint lineComment, bool newParagraph) pure
    {
        /* ct tells us which kind of comment it is: '/', '*', or '+'
         */
        const ct = t.ptr[2];
        /* Start of comment text skips over / * *, / + +, or / / /
         */
        const(char)* q = t.ptr + 3; // start of comment text
        const(char)* qend = p;
        // block comments end in a two character terminator that is not part of the text
        if (ct == '*' || ct == '+')
            qend -= 2;
        /* Scan over initial row of ****'s or ++++'s or ////'s
         */
        for (; q < qend; q++)
        {
            if (*q != ct)
                break;
        }
        /* Remove leading spaces until start of the comment
         */
        int linestart = 0;  // 1 while only whitespace was seen since the last newline
        if (ct == '/')
        {
            while (q < qend && (*q == ' ' || *q == '\t'))
                ++q;
        }
        else if (q < qend)
        {
            // block comment: drop a newline directly after the opening row
            if (*q == '\r')
            {
                ++q;
                if (q < qend && *q == '\n')
                    ++q;
                linestart = 1;
            }
            else if (*q == '\n')
            {
                ++q;
                linestart = 1;
            }
        }
        /* Remove trailing row of ****'s or ++++'s
         */
        if (ct != '/')
        {
            for (; q < qend; qend--)
            {
                if (qend[-1] != ct)
                    break;
            }
        }
        /* Comment is now [q .. qend].
         * Canonicalize it into buf[].
         */
        OutBuffer buf;

        // Drops blanks and tabs from the end of what has been written to buf so far
        void trimTrailingWhitespace()
        {
            const s = buf[];
            auto len = s.length;
            while (len && (s[len - 1] == ' ' || s[len - 1] == '\t'))
                --len;
            buf.setsize(len);
        }

        for (; q < qend; q++)
        {
            char c = *q;
            switch (c)
            {
            case '*':
            case '+':
                // the first comment character on a line is decoration, not text
                if (linestart && c == ct)
                {
                    linestart = 0;
                    /* Trim preceding whitespace up to preceding \n
                     */
                    trimTrailingWhitespace();
                    continue;
                }
                break;
            case ' ':
            case '\t':
                break;
            case '\r':
                if (q[1] == '\n')
                    continue; // skip the \r
                goto Lnewline;
            default:
                if (c == 226)
                {
                    // If LS or PS
                    if (q[1] == 128 && (q[2] == 168 || q[2] == 169))
                    {
                        q += 2;
                        goto Lnewline;
                    }
                }
                linestart = 0;
                break;
            Lnewline:
                c = '\n'; // replace all newlines with \n
                goto case;
            case '\n':
                linestart = 1;
                /* Trim trailing whitespace
                 */
                trimTrailingWhitespace();
                break;
            }
            buf.writeByte(c);
        }
        /* Trim trailing whitespace (if the last line does not have newline)
         */
        trimTrailingWhitespace();

        // Always end with a newline
        const s = buf[];
        if (s.length == 0 || s[$ - 1] != '\n')
            buf.writeByte('\n');

        // It's a line comment if the start of the doc comment comes
        // after other non-whitespace on the same line.
        auto dc = (lineComment && anyToken) ? &t.lineComment : &t.blockComment;
        // Combine with previous doc comment, if any
        if (*dc)
            *dc = combineComments(*dc, buf[], newParagraph).toDString();
        else
            *dc = buf.extractSlice(true);
    }

    /********************************************
     * Combine two document comments into one,
     * separated by an extra newline if newParagraph is true.
     * Params:
     *  c1 = first comment
     *  c2 = second comment
     *  newParagraph = insert an additional '\n' between the two
     * Returns:
     *  `c2.ptr` if `c1` is null and `c1.ptr` if `c2` is null; otherwise a
     *  0-terminated buffer obtained from `mem.xmalloc_noscan` that holds `c1`,
     *  a '\n' if `c1` does not already end in one, the optional paragraph
     *  newline, and `c2`
     */
    static const(char)* combineComments(const(char)[] c1, const(char)[] c2, bool newParagraph) pure
    {
        //debug printf("Lexer::combineComments('%.*s', '%.*s', '%i')\n", cast(int) c1.length, c1.ptr, cast(int) c2.length, c2.ptr, newParagraph);
        const(int) newParagraphSize = newParagraph ? 1 : 0; // Size of the combining '\n'
        if (!c1)
            return c2.ptr;
        if (!c2)
            return c1.ptr;

        int insertNewLine = 0;
        if (c1.length && c1[$ - 1] != '\n')
            insertNewLine = 1;
        const retSize = c1.length + insertNewLine + newParagraphSize + c2.length;
        auto p = cast(char*)mem.xmalloc_noscan(retSize + 1);    // +1 for the terminating 0
        p[0 .. c1.length] = c1[];
        if (insertNewLine)
            p[c1.length] = '\n';
        if (newParagraph)
            p[c1.length + insertNewLine] = '\n';
        p[retSize - c2.length .. retSize] = c2[];
        p[retSize] = 0;
        return p;
    }

    /**************************
     * `p` should be at start of next line
     */
    private void endOfLine() pure @nogc @safe
    {
        scanloc.linnum++;
        line = p;
    }
}


/******************************* Private *****************************************/

private:

/// Support for `__DATE__`, `__TIME__`, and `__TIMESTAMP__`
private struct TimeStampInfo
{
    private __gshared bool initdone = false;

    // Note: Those properties need to be guarded by a call to `initialize`
    // The API isn't safe, and quite brittle, but it was left this way
    // over performance concerns.
    // This is currently only called once, from the lexer.
    __gshared char[11 + 1] date;
    __gshared char[8 + 1] time;
    __gshared char[24 + 1] timestamp;

    /**
     * Fills `date`, `time` and `timestamp` on the first call; later calls
     * return immediately.
     *
     * The time is taken from the environment variable `SOURCE_DATE_EPOCH`
     * when it is set, otherwise from the system clock.
     * Params:
     *  loc = location used when reporting an invalid `SOURCE_DATE_EPOCH`
     */
    public static void initialize(const ref Loc loc) nothrow
    {
        if (initdone)
            return;

        initdone = true;
        time_t ct;
        // https://issues.dlang.org/show_bug.cgi?id=20444
        if (auto p = getenv("SOURCE_DATE_EPOCH"))
        {
            if (!ct.parseDigits(p.toDString()))
                error(loc, "value of environment variable `SOURCE_DATE_EPOCH` should be a valid UNIX timestamp, not: `%s`", p);
        }
        else
            .time(&ct);
        // ctime() text is laid out as "Www Mmm dd hh:mm:ss yyyy\n"; the
        // offsets below pick the pieces out of that fixed layout
        const p = ctime(&ct);
        assert(p);
        sprintf(&date[0], "%.6s %.4s", p + 4, p + 20);
        sprintf(&time[0], "%.8s", p + 11);
        sprintf(&timestamp[0], "%.24s", p);
    }
}

private enum LS = 0x2028;       // UTF line separator
private enum PS = 0x2029;       // UTF paragraph separator

/********************************************
 * Do our own char maps
 *
 * `cmtable[c]` is a set of `CM*` bits describing character `c`; it is built
 * once by the function literal below.
 */
private static immutable cmtable = () {
    ubyte[256] table;
    foreach (const c; 0 .. table.length)
    {
        if ('0' <= c && c <= '7')
            table[c] |= CMoctal;
        if (c_isxdigit(c))
            table[c] |= CMhex;
        if (c_isalnum(c) || c == '_')
            table[c] |= CMidchar;

        switch (c)
        {
            case 'x': case 'X':
            case 'b': case 'B':
                table[c] |= CMzerosecond;
                break;

            case '0': .. case '9':
            case 'e': case 'E':
            case 'f': case 'F':
            case 'l': case 'L':
            case 'p': case 'P':
            case 'u': case 'U':
            case 'i':
            case '.':
            case '_':
                table[c] |= CMzerosecond | CMdigitsecond;
                break;

            default:
                break;
        }

        switch (c)
        {
            case '\\':
            case '\n':
            case '\r':
            case 0:
            case 0x1A:
            case '\'':
                break;
            default:
                // only 7-bit characters qualify
                if (!(c & 0x80))
                    table[c] |= CMsinglechar;
                break;
        }
    }
    return table;
}();

// Bit flags stored in cmtable
private
{
    enum CMoctal  = 0x1;
    enum CMhex    = 0x2;
    enum CMidchar = 0x4;
    enum CMzerosecond = 0x8;
    enum CMdigitsecond = 0x10;
    enum CMsinglechar = 0x20;
}

/// Returns: true if `c` is one of '0'..'7'
private bool isoctal(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMoctal) != 0;
}

/// Returns: true if `c` is a hexadecimal digit
private bool ishex(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMhex) != 0;
}

/// Returns: true if `c` is an ASCII letter, a digit or '_'
private bool isidchar(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMidchar) != 0;
}

/// Returns: true if `c` carries the `CMzerosecond` flag in `cmtable`
private bool isZeroSecond(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMzerosecond) != 0;
}

/// Returns: true if `c` carries the `CMdigitsecond` flag in `cmtable`
private bool isDigitSecond(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMdigitsecond) != 0;
}

/// Returns: true if `c` carries the `CMsinglechar` flag in `cmtable`
private bool issinglechar(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMsinglechar) != 0;
}

/// Returns: true if `c` is '0'..'9', 'a'..'f' or 'A'..'F'
private bool c_isxdigit(const int c) pure @nogc @safe
{
    return (( c >= '0' && c <= '9') ||
            ( c >= 'a' && c <= 'f') ||
            ( c >= 'A' && c <= 'F'));
}

/// Returns: true if `c` is '0'..'9', 'a'..'z' or 'A'..'Z'
private bool c_isalnum(const int c) pure @nogc @safe
{
    return (( c >= '0' && c <= '9') ||
            ( c >= 'a' && c <= 'z') ||
            ( c >= 'A' && c <= 'Z'));
}

/******************************* Unittest *****************************************/

// Valid escape sequences: no diagnostic may be issued and the whole input must be consumed
unittest
{
    import dmd.console;

    nothrow bool assertDiagnosticHandler(const ref Loc loc, Color headerColor, const(char)* header,
                                   const(char)* format, va_list ap, const(char)* p1, const(char)* p2)
    {
        assert(0);
    }
    diagnosticHandler = &assertDiagnosticHandler;

    static void test(T)(string sequence, T expected, bool Ccompile = false)
    {
        auto p = cast(const(char)*)sequence.ptr;
        assert(expected == Lexer.escapeSequence(Loc.initial, p, Ccompile));
        assert(p == sequence.ptr + sequence.length);
    }

    test(`'`, '\'');
    test(`"`, '"');
    test(`?`, '?');
    test(`\`, '\\');
    test(`0`, '\0');
    test(`a`, '\a');
    test(`b`, '\b');
    test(`f`, '\f');
    test(`n`, '\n');
    test(`r`, '\r');
    test(`t`, '\t');
    test(`v`, '\v');

    test(`x00`, 0x00);
    test(`xff`, 0xff);
    test(`xFF`, 0xff);
    test(`xa7`, 0xa7);
    test(`x3c`, 0x3c);
    test(`xe2`, 0xe2);

    test(`1`, '\1');
    test(`42`, '\42');
    test(`357`, '\357');

    test(`u1234`, '\u1234');
    test(`uf0e4`, '\uf0e4');

    test(`U0001f603`, '\U0001f603');

    // named character entities
    test(`&quot;`, '"');
    test(`&lt;`, '<');
    test(`&gt;`, '>');

    diagnosticHandler = null;
}

// Invalid escape sequences: checks the diagnostic text, the returned character
// and how many characters were consumed
unittest
{
    import dmd.console;
    string expected;
    bool gotError;

    nothrow bool expectDiagnosticHandler(const ref Loc loc, Color headerColor, const(char)* header,
                                         const(char)* format, va_list ap, const(char)* p1, const(char)* p2)
    {
        assert(cast(Classification)headerColor == Classification.error);

        gotError = true;
        char[100] buffer = void;
        auto actual = buffer[0 .. vsprintf(buffer.ptr, format, ap)];
        assert(expected == actual);
        return true;
    }

    diagnosticHandler = &expectDiagnosticHandler;

    void test(string sequence, string expectedError, dchar expectedReturnValue, uint expectedScanLength, bool Ccompile = false)
    {
        uint errors = global.errors;
        gotError = false;
        expected = expectedError;
        auto p = cast(const(char)*)sequence.ptr;
        auto actualReturnValue = Lexer.escapeSequence(Loc.initial, p, Ccompile);
        assert(gotError);
        assert(expectedReturnValue == actualReturnValue);

        auto actualScanLength = p - sequence.ptr;
        assert(expectedScanLength == actualScanLength);
        global.errors = errors;     // do not let the expected errors leak out of the test
    }

    test("c", `undefined escape sequence \c`, 'c', 1);
    test("!", `undefined escape sequence \!`, '!', 1);
    // with Ccompile set only the '&' is consumed
    test("&quot;", `undefined escape sequence \&`, '&', 1, true);

    test("x1", `escape hex sequence has 1 hex digits instead of 2`, '\x01', 2);

    test("u1"  , `escape hex sequence has 1 hex digits instead of 4`,   0x1, 2);
    test("u12" , `escape hex sequence has 2 hex digits instead of 4`,  0x12, 3);
    test("u123", `escape hex sequence has 3 hex digits instead of 4`, 0x123, 4);

    test("U0"      , `escape hex sequence has 1 hex digits instead of 8`,       0x0, 2);
    test("U00"     , `escape hex sequence has 2 hex digits instead of 8`,      0x00, 3);
    test("U000"    , `escape hex sequence has 3 hex digits instead of 8`,     0x000, 4);
    test("U0000"   , `escape hex sequence has 4 hex digits instead of 8`,    0x0000, 5);
    test("U0001f"  , `escape hex sequence has 5 hex digits instead of 8`,   0x0001f, 6);
    test("U0001f6" , `escape hex sequence has 6 hex digits instead of 8`,  0x0001f6, 7);
    test("U0001f60", `escape hex sequence has 7 hex digits instead of 8`, 0x0001f60, 8);

    test("ud800"    , `invalid UTF character \U0000d800`, '?', 5);
    test("udfff"    , `invalid UTF character \U0000dfff`, '?', 5);
    test("U00110000", `invalid UTF character \U00110000`, '?', 9);

    test("xg0"      , `undefined escape hex sequence \xg`, 'g', 2);
    test("ug000"    , `undefined escape hex sequence \ug`, 'g', 2);
    test("Ug0000000", `undefined escape hex sequence \Ug`, 'g', 2);

    test("&BAD;", `unnamed character entity &BAD;`  , '?', 5);
    test("&quot", `unterminated named entity &quot;`, '?', 5);
    test("&quot", `unterminated named entity &quot;`, '?', 5);

    test("400", `escape octal sequence \400 is larger than \377`, 0x100, 3);

    diagnosticHandler = null;
}

unittest
{
    //printf("lexer.unittest\n");
    /* Not much here, just trying things out.
     */
    string text = "int"; // We rely on the implicit null-terminator
    scope Lexer lex1 = new Lexer(null, text.ptr, 0, text.length, 0, 0);
    TOK tok;
    tok = lex1.nextToken();
    //printf("tok == %s, %d, %d\n", Token::toChars(tok), tok, TOK.int32);
    assert(tok == TOK.int32);
    // once the end of the input is reached, endOfFile is returned again and again
    tok = lex1.nextToken();
    assert(tok == TOK.endOfFile);
    tok = lex1.nextToken();
    assert(tok == TOK.endOfFile);
    tok = lex1.nextToken();
    assert(tok == TOK.endOfFile);
}

unittest
{
    // We don't want to see Lexer error output during these tests.
    uint errors = global.startGagging();
    scope(exit) global.endGagging(errors);

    // Test malformed input: even malformed input should end in a TOK.endOfFile.
    static immutable char[][] testcases =
    [   // Testcase must end with 0 or 0x1A.
        [0], // not malformed, but pathological
        ['\'', 0],
        ['\'', 0x1A],
        ['{', '{', 'q', '{', 0],
        [0xFF, 0],
        [0xFF, 0x80, 0],
        [0xFF, 0xFF, 0],
        [0xFF, 0xFF, 0],
        ['x', '"', 0x1A],
    ];

    foreach (testcase; testcases)
    {
        scope Lexer lex2 = new Lexer(null, testcase.ptr, 0, testcase.length-1, 0, 0);
        TOK tok = lex2.nextToken();
        // bound the number of tokens so that a lexer that never reaches
        // endOfFile fails the assert below instead of looping forever
        size_t iterations = 1;
        while ((tok != TOK.endOfFile) && (iterations++ < testcase.length))
        {
            tok = lex2.nextToken();
        }
        assert(tok == TOK.endOfFile);
        tok = lex2.nextToken();
        assert(tok == TOK.endOfFile);
    }
}