diff options
Diffstat (limited to 'idlc/source/preproc/lex.c')
-rw-r--r-- | idlc/source/preproc/lex.c | 695 |
1 files changed, 695 insertions, 0 deletions
diff --git a/idlc/source/preproc/lex.c b/idlc/source/preproc/lex.c new file mode 100644 index 000000000000..fd6d00792984 --- /dev/null +++ b/idlc/source/preproc/lex.c @@ -0,0 +1,695 @@ +/************************************************************************* + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * Copyright 2000, 2010 Oracle and/or its affiliates. + * + * OpenOffice.org - a multi-platform office productivity suite + * + * This file is part of OpenOffice.org. + * + * OpenOffice.org is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 3 + * only, as published by the Free Software Foundation. + * + * OpenOffice.org is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License version 3 for more details + * (a copy is included in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU Lesser General Public License + * version 3 along with OpenOffice.org. If not, see + * <http://www.openoffice.org/license.html> + * for a copy of the LGPLv3 License. + * + ************************************************************************/ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#if (defined(_WIN32) || defined(_MSDOS) || defined(__IBMC__)) +#include <io.h> +#else +#include <unistd.h> +#endif +#include "cpp.h" + +/* + * lexical FSM encoding + * when in state state, and one of the characters + * in ch arrives, enter nextstate. + * States >= S_SELF are either final, or at least require special action. + * In 'fsm' there is a line for each state X charset X nextstate. + * List chars that overwrite previous entries later (e.g. C_ALPH + * can be overridden by '_' by a later entry; and C_XX is the + * the universal set, and should always be first. + * States above S_SELF are represented in the big table as negative values. + * S_SELF and S_SELFB encode the resulting token type in the upper bits. + * These actions differ in that S_SELF doesn't have a lookahead char, + * S_SELFB does. + * + * The encoding is blown out into a big table for time-efficiency. + * Entries have + * nextstate: 6 bits; ?\ marker: 1 bit; tokentype: 9 bits. + */ + +#define MAXSTATE 32 +#define ACT(tok,act) ((tok<<7)+act) +#define QBSBIT 0100 +#define GETACT(st) ((st>>7)&0x1ff) + +/* character classes */ +#define C_WS 1 +#define C_ALPH 2 +#define C_NUM 3 +#define C_EOF 4 +#define C_XX 5 + +enum state +{ + START = 0, NUM1, NUM2, NUM3, ID1, ST1, ST2, ST3, COM1, COM2, COM3, COM4, + CC1, CC2, WS1, PLUS1, MINUS1, STAR1, SLASH1, PCT1, SHARP1, + CIRC1, GT1, GT2, LT1, LT2, OR1, AND1, ASG1, NOT1, DOTS1, + S_SELF = MAXSTATE, S_SELFB, S_EOF, S_NL, S_EOFSTR, + S_STNL, S_COMNL, S_EOFCOM, S_COMMENT, S_EOB, S_WS, S_NAME +}; + +int tottok; +int tokkind[256]; +struct fsm +{ + int state; /* if in this state */ + uchar ch[4]; /* and see one of these characters */ + int nextstate; /* enter this state if +ve */ +}; + + /*const*/ struct fsm fsm[] = { + /* start state */ + { START, {C_XX}, ACT(UNCLASS, S_SELF) }, + { START, {' ', '\t', '\v'}, WS1 }, + { START, {C_NUM}, NUM1 }, + { START, {'.'}, NUM3 }, + { START, {C_ALPH}, ID1 }, + { START, {'L'}, ST1 }, + { START, {'"'}, ST2 }, + { START, {'\''}, CC1 }, + { START, {'/'}, COM1 }, + { START, {EOFC}, S_EOF }, + { START, {'\n'}, S_NL }, + { START, {'-'}, MINUS1 }, + { START, {'+'}, PLUS1 }, + { START, {'<'}, LT1 }, + { START, {'>'}, GT1 }, + { START, {'='}, ASG1 }, + { START, {'!'}, NOT1 }, + { START, {'&'}, AND1 }, + { START, {'|'}, OR1 }, + { START, {'#'}, SHARP1 }, + { START, {'%'}, PCT1 }, + { START, {'['}, ACT(SBRA, S_SELF) }, + { START, {']'}, ACT(SKET, S_SELF) }, + { START, {'('}, ACT(LP, S_SELF) }, + { START, {')'}, ACT(RP, S_SELF) }, + { START, {'*'}, STAR1 }, + { START, {','}, ACT(COMMA, S_SELF) }, + { START, {'?'}, ACT(QUEST, S_SELF) }, + { START, {':'}, ACT(COLON, S_SELF) }, + { START, {';'}, ACT(SEMIC, S_SELF) }, + { START, {'{'}, ACT(CBRA, S_SELF) }, + { START, {'}'}, ACT(CKET, S_SELF) }, + { START, {'~'}, ACT(TILDE, S_SELF) }, + { START, {'^'}, CIRC1 }, + + /* saw a digit */ + { NUM1, {C_XX}, ACT(NUMBER, S_SELFB) }, + { NUM1, {C_NUM, C_ALPH, '.'}, NUM1 }, + { NUM1, {'E', 'e'}, NUM2 }, + { NUM1, {'_'}, ACT(NUMBER, S_SELFB) }, + + /* saw possible start of exponent, digits-e */ + { NUM2, {C_XX}, ACT(NUMBER, S_SELFB) }, + { NUM2, {'+', '-'}, NUM1 }, + { NUM2, {C_NUM, C_ALPH}, NUM1 }, + { NUM2, {'_'}, ACT(NUMBER, S_SELFB) }, + + /* saw a '.', which could be a number or an operator */ + { NUM3, {C_XX}, ACT(DOT, S_SELFB) }, + { NUM3, {'.'}, DOTS1 }, + { NUM3, {C_NUM}, NUM1 }, + + { DOTS1, {C_XX}, ACT(UNCLASS, S_SELFB) }, + { DOTS1, {C_NUM}, NUM1 }, + { DOTS1, {'.'}, ACT(ELLIPS, S_SELF) }, + + /* saw a letter or _ */ + { ID1, {C_XX}, ACT(NAME, S_NAME) }, + { ID1, {C_ALPH, C_NUM}, ID1 }, + + /* saw L (start of wide string?) */ + { ST1, {C_XX}, ACT(NAME, S_NAME) }, + { ST1, {C_ALPH, C_NUM}, ID1 }, + { ST1, {'"'}, ST2 }, + { ST1, {'\''}, CC1 }, + + /* saw " beginning string */ + { ST2, {C_XX}, ST2 }, + { ST2, {'"'}, ACT(STRING, S_SELF) }, + { ST2, {'\\'}, ST3 }, + { ST2, {'\n'}, S_STNL }, + { ST2, {EOFC}, S_EOFSTR }, + + /* saw \ in string */ + { ST3, {C_XX}, ST2 }, + { ST3, {'\n'}, S_STNL }, + { ST3, {EOFC}, S_EOFSTR }, + + /* saw ' beginning character const */ + { CC1, {C_XX}, CC1 }, + { CC1, {'\''}, ACT(CCON, S_SELF) }, + { CC1, {'\\'}, CC2 }, + { CC1, {'\n'}, S_STNL }, + { CC1, {EOFC}, S_EOFSTR }, + + /* saw \ in ccon */ + { CC2, {C_XX}, CC1 }, + { CC2, {'\n'}, S_STNL }, + { CC2, {EOFC}, S_EOFSTR }, + + /* saw /, perhaps start of comment */ + { COM1, {C_XX}, ACT(SLASH, S_SELFB) }, + { COM1, {'='}, ACT(ASSLASH, S_SELF) }, + { COM1, {'*'}, COM2 }, + { COM1, {'/'}, COM4 }, + + /* saw / followed by *, start of comment */ + { COM2, {C_XX}, COM2 }, + { COM2, {'\n'}, S_COMNL }, + { COM2, {'*'}, COM3 }, + { COM2, {EOFC}, S_EOFCOM }, + + /* saw the * possibly ending a comment */ + { COM3, {C_XX}, COM2 }, + { COM3, {'\n'}, S_COMNL }, + { COM3, {'*'}, COM3 }, + { COM3, {'/'}, S_COMMENT }, + + /* // comment */ + { COM4, {C_XX}, COM4 }, + { COM4, {'\n'}, S_COMMENT }, + /* { COM4, {'\n'}, S_NL }, */ + { COM4, {EOFC}, S_EOFCOM }, + + /* saw white space, eat it up */ + { WS1, {C_XX}, S_WS }, + { WS1, {'\t', '\v', ' '}, WS1 }, + + /* saw -, check --, -=, -> */ + { MINUS1, {C_XX}, ACT(MINUS, S_SELFB) }, + { MINUS1, {'-'}, ACT(MMINUS, S_SELF) }, + { MINUS1, {'='}, ACT(ASMINUS, S_SELF) }, + { MINUS1, {'>'}, ACT(ARROW, S_SELF) }, + + /* saw +, check ++, += */ + { PLUS1, {C_XX}, ACT(PLUS, S_SELFB) }, + { PLUS1, {'+'}, ACT(PPLUS, S_SELF) }, + { PLUS1, {'='}, ACT(ASPLUS, S_SELF) }, + + /* saw <, check <<, <<=, <= */ + { LT1, {C_XX}, ACT(LT, S_SELFB) }, + { LT1, {'<'}, LT2 }, + { LT1, {'='}, ACT(LEQ, S_SELF) }, + { LT2, {C_XX}, ACT(LSH, S_SELFB) }, + { LT2, {'='}, ACT(ASLSH, S_SELF) }, + + /* saw >, check >>, >>=, >= */ + { GT1, {C_XX}, ACT(GT, S_SELFB) }, + { GT1, {'>'}, GT2 }, + { GT1, {'='}, ACT(GEQ, S_SELF) }, + { GT2, {C_XX}, ACT(RSH, S_SELFB) }, + { GT2, {'='}, ACT(ASRSH, S_SELF) }, + + /* = */ + { ASG1, {C_XX}, ACT(ASGN, S_SELFB) }, + { ASG1, {'='}, ACT(EQ, S_SELF) }, + + /* ! */ + { NOT1, {C_XX}, ACT(NOT, S_SELFB) }, + { NOT1, {'='}, ACT(NEQ, S_SELF) }, + + /* & */ + { AND1, {C_XX}, ACT(AND, S_SELFB) }, + { AND1, {'&'}, ACT(LAND, S_SELF) }, + { AND1, {'='}, ACT(ASAND, S_SELF) }, + + /* | */ + { OR1, {C_XX}, ACT(OR, S_SELFB) }, + { OR1, {'|'}, ACT(LOR, S_SELF) }, + { OR1, {'='}, ACT(ASOR, S_SELF) }, + + /* # */ + { SHARP1, {C_XX}, ACT(SHARP, S_SELFB) }, + { SHARP1, {'#'}, ACT(DSHARP, S_SELF) }, + + /* % */ + { PCT1, {C_XX}, ACT(PCT, S_SELFB) }, + { PCT1, {'='}, ACT(ASPCT, S_SELF) }, + + /* * */ + { STAR1, {C_XX}, ACT(STAR, S_SELFB) }, + { STAR1, {'='}, ACT(ASSTAR, S_SELF) }, + + /* ^ */ + { CIRC1, {C_XX}, ACT(CIRC, S_SELFB) }, + { CIRC1, {'='}, ACT(ASCIRC, S_SELF) }, + + { -1, {'\0'}, S_SELF } +}; + +/* first index is char, second is state */ +/* increase #states to power of 2 to encourage use of shift */ +short bigfsm[256][MAXSTATE]; + +void + expandlex(void) +{ + /* const */ struct fsm *fp; + int i, j, nstate; + + for (fp = fsm; fp->state >= 0; fp++) + { + for (i = 0; fp->ch[i]; i++) + { + nstate = fp->nextstate; + if (nstate >= S_SELF) + nstate = ~nstate; + switch (fp->ch[i]) + { + + case C_XX: /* random characters */ + for (j = 0; j < 256; j++) + bigfsm[j][fp->state] = (short) nstate; + continue; + case C_ALPH: + for (j = 0; j <= 256; j++) + if (('a' <= j && j <= 'z') || ('A' <= j && j <= 'Z') + || j == '_') + bigfsm[j][fp->state] = (short) nstate; + continue; + case C_NUM: + for (j = '0'; j <= '9'; j++) + bigfsm[j][fp->state] = (short) nstate; + continue; + default: + bigfsm[fp->ch[i]][fp->state] = (short) nstate; + } + } + } + + /* + * install special cases for ? (trigraphs), \ (splicing), runes, and + * EOB + */ + for (i = 0; i < MAXSTATE; i++) + { + for (j = 0; j < 0xFF; j++) + if (j == '?' || j == '\\' || j == '\n' || j == '\r') + { + if (bigfsm[j][i] > 0) + bigfsm[j][i] = ~bigfsm[j][i]; + bigfsm[j][i] &= ~QBSBIT; + } + bigfsm[EOB][i] = ~S_EOB; + if (bigfsm[EOFC][i] >= 0) + bigfsm[EOFC][i] = ~S_EOF; + } +} + +/* + * fill in a row of tokens from input, terminated by NL or END + * First token is put at trp->lp. + * Reset is non-zero when the input buffer can be "rewound." + * The value is a flag indicating that possible macros have + * been seen in the row. + */ +int + gettokens(Tokenrow * trp, int reset) +{ + register int c, state, oldstate; + register uchar *ip; + register Token *tp, *maxp; + int runelen; + Source *s = cursource; + int nmac = 0; + + tp = trp->lp; + ip = s->inp; + if (reset) + { + s->lineinc = 0; + if (ip >= s->inl) + { /* nothing in buffer */ + s->inl = s->inb; + fillbuf(s); + ip = s->inp = s->inb; + } + else + if (ip >= s->inb + (3 * INS / 4)) + { + memmove(s->inb, ip, 4 + s->inl - ip); + s->inl = s->inb + (s->inl - ip); + ip = s->inp = s->inb; + } + } + maxp = &trp->bp[trp->max]; + runelen = 1; + for (;;) + { +continue2: + if (tp >= maxp) + { + trp->lp = tp; + tp = growtokenrow(trp); + maxp = &trp->bp[trp->max]; + } + tp->type = UNCLASS; + tp->t = ip; + tp->wslen = 0; + tp->flag = 0; + state = START; + for (;;) + { + oldstate = state; + + c = *ip; + + if ((state = bigfsm[c][state]) >= 0) + { + ip += runelen; + runelen = 1; + continue; + } + state = ~state; + reswitch: + switch (state & 0177) + { + case S_SELF: + ip += runelen; + runelen = 1; + case S_SELFB: + tp->type = (unsigned char) GETACT(state); + tp->len = ip - tp->t; + tp++; + goto continue2; + + case S_NAME: /* like S_SELFB but with nmac check */ + tp->type = NAME; + tp->len = ip - tp->t; + nmac |= quicklook(tp->t[0], tp->len > 1 ? tp->t[1] : 0); + tp++; + goto continue2; + + case S_WS: + tp->wslen = ip - tp->t; + tp->t = ip; + state = START; + continue; + + default: + if ((state & QBSBIT) == 0) + { + ip += runelen; + runelen = 1; + continue; + } + state &= ~QBSBIT; + s->inp = ip; + + if (c == '\n') + { + while (s->inp + 1 >= s->inl && fillbuf(s) != EOF); + + if (s->inp[1] == '\r') + { + memmove(s->inp + 1, s->inp + 2, s->inl - s->inp + 2); + s->inl -= 1; + } + + goto reswitch; + } + + if (c == '\r') + { + while (s->inp + 1 >= s->inl && fillbuf(s) != EOF); + + if (s->inp[1] == '\n') + { + memmove(s->inp, s->inp + 1, s->inl - s->inp + 1); + s->inl -= 1; + } + else + *s->inp = '\n'; + + state = oldstate; + continue; + } + + if (c == '?') + { /* check trigraph */ + if (trigraph(s)) + { + state = oldstate; + continue; + } + goto reswitch; + } + if (c == '\\') + { /* line-folding */ + if (foldline(s)) + { + s->lineinc++; + state = oldstate; + continue; + } + goto reswitch; + } + error(WARNING, "Lexical botch in cpp"); + ip += runelen; + runelen = 1; + continue; + + case S_EOB: + s->inp = ip; + fillbuf(cursource); + state = oldstate; + continue; + + case S_EOF: + tp->type = END; + tp->len = 0; + s->inp = ip; + if (tp != trp->bp && (tp - 1)->type != NL && cursource->fd != -1) + error(WARNING, "No newline at end of file"); + trp->lp = tp + 1; + return nmac; + + case S_STNL: + error(ERROR, "Unterminated string or char const"); + case S_NL: + tp->t = ip; + tp->type = NL; + tp->len = 1; + tp->wslen = 0; + s->lineinc++; + s->inp = ip + 1; + trp->lp = tp + 1; + return nmac; + + case S_EOFSTR: + error(FATAL, "EOF in string or char constant"); + break; + + case S_COMNL: + s->lineinc++; + state = COM2; + ip += runelen; + runelen = 1; + continue; + + case S_EOFCOM: + error(WARNING, "EOF inside comment"); + --ip; + case S_COMMENT: + if (!Cflag) + { + tp->t = ++ip; + tp->t[-1] = ' '; + tp->wslen = 1; + state = START; + continue; + } + else + { + runelen = 1; + /* s->lineinc = 0; */ + tp->type = COMMENT; + tp->flag |= XTWS; + } + } + break; + } + ip += runelen; + runelen = 1; + tp->len = ip - tp->t; + tp++; + } +} + +/* have seen ?; handle the trigraph it starts (if any) else 0 */ +int + trigraph(Source * s) +{ + uchar c; + + while (s->inp + 2 >= s->inl && fillbuf(s) != EOF); + ; + if (s->inp[1] != '?') + return 0; + c = 0; + switch (s->inp[2]) + { + case '=': + c = '#'; + break; + case '(': + c = '['; + break; + case '/': + c = '\\'; + break; + case ')': + c = ']'; + break; + case '\'': + c = '^'; + break; + case '<': + c = '{'; + break; + case '!': + c = '|'; + break; + case '>': + c = '}'; + break; + case '-': + c = '~'; + break; + } + if (c) + { + *s->inp = c; + memmove(s->inp + 1, s->inp + 3, s->inl - s->inp + 2); + s->inl -= 2; + } + return c; +} + +int + foldline(Source * s) +{ + int n = 1; + + while (s->inp + 2 >= s->inl && fillbuf(s) != EOF); + + /* skip DOS line ends */ + if (((s->inp[n] == '\r') && (s->inp[n+1] == '\n')) || + ((s->inp[n] == '\n') && (s->inp[n+1] == '\r'))) + n++; + + if ((s->inp[n] == '\n') || (s->inp[n] == '\r')) + { + memmove(s->inp, s->inp + n + 1, s->inl - s->inp + n + 2); + s->inl -= n + 1; + return 1; + } + return 0; +} + +int + fillbuf(Source * s) +{ + int n; + + if (s->fd < 0 || (n = read(s->fd, (char *) s->inl, INS / 8)) <= 0) + n = 0; + s->inl += n; + s->inl[0] = s->inl[1] = s->inl[2] = s->inl[3] = EOB; + if (n == 0) + { + s->inl[0] = s->inl[1] = s->inl[2] = s->inl[3] = EOFC; + return EOF; + } + return 0; +} + +/* + * Push down to new source of characters. + * If fd>0 and str==NULL, then from a file `name'; + * if fd==-1 and str, then from the string. + */ +Source * + setsource(char *name, int path, int fd, char *str, int wrap) +{ + Source *s = new(Source); + int len; + + s->line = 1; + s->lineinc = 0; + s->fd = fd; + s->filename = name; + s->next = cursource; + s->ifdepth = 0; + s->pathdepth = path; + s->wrap = wrap; + + cursource = s; + + if (s->wrap) + genwrap(0); + + /* slop at right for EOB */ + if (str) + { + len = strlen(str); + s->inb = domalloc(len + 4); + s->inp = s->inb; + strncpy((char *) s->inp, str, len); + } + else + { + s->inb = domalloc(INS + 4); + s->inp = s->inb; + len = 0; + } + s->inl = s->inp + len; + s->inl[0] = s->inl[1] = EOB; + + return s; +} + +void + unsetsource(void) +{ + Source *s = cursource; + + if (s->wrap) + genwrap(1); + + if (s->fd >= 0) + { + close(s->fd); + dofree(s->inb); + } + cursource = s->next; + dofree(s); +} |