| /* |
| * (c) Thomas Pornin 1999, 2000 |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * 4. The name of the authors may not be used to endorse or promote |
| * products derived from this software without specific prior written |
| * permission. |
| * |
| * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR |
| * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE |
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT |
| * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
| * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
| * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE |
| * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, |
| * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| * |
| */ |
| |
| /* |
| * vb: optional MISRA checking hooks, declared only when HAVE_MISRA is |
| * defined. In this file, misracheck gates the source character check |
| * done in read_char(), and misra_neu() is called from read_char() and |
| * next_char(). |
| */ |
| #ifdef HAVE_MISRA |
| extern int misracheck; |
| void misra(int,...); |
| void misra_neu(int, int, int, int, ...); |
| #endif |
| |
| #include "tune.h" |
| #ifdef UCPP_MMAP |
| #ifndef _POSIX_SOURCE |
| #define _POSIX_SOURCE 1 |
| #endif |
| #endif |
| #include <stdio.h> |
| #include <string.h> |
| #include <stddef.h> |
| #include <limits.h> |
| #include <ctype.h> |
| #include "ucppi.h" |
| #include "mem.h" |
| #ifdef UCPP_MMAP |
| #include <unistd.h> |
| #include <sys/types.h> |
| #include <sys/mman.h> |
| #endif |
| |
| /* |
| * Character classes for description of the automaton. |
| * The characters used for representing classes should not appear |
| * explicitly in an automaton rule: init_cppm() interprets them as |
| * classes and expands each one into the characters it stands for. |
| */ |
| #define SPC ' ' /* whitespace characters (newline is not part of the class) */ |
| #define ALP 'Z' /* A-Z, a-z, _ */ |
| #define NUM '9' /* 0-9 */ |
| #define ANY 'Y' /* any character, including the end-of-input character */ |
| #define VCH 'F' /* void character (for end of input) */ |
| |
| /* |
| * flags and macros to test those flags |
| * STO: the currently read string is a complete token |
| * PUT: the currently read character must be added to the string |
| * FRZ: the currently read character must be kept and read again |
| * |
| * An automaton action is an int: the low 8 bits hold a state or a |
| * token type, and the bits 256, 512 and 1024 hold the flags above. |
| * The X(x) macros set a flag, the ttX(x) macros test it, and noMOD(x) |
| * strips all flags. |
| */ |
| #define MOD_MK 255 /* mask of the state / token type bits */ |
| #define noMOD(x) ((x) & 255) |
| #define STO(x) ((x) | 256) |
| #define ttSTO(x) ((x) & 256) |
| #define FRZ(x) ((x) | 512) |
| #define ttFRZ(x) ((x) & 512) |
| #define PUT(x) ((x) | 1024) |
| #define ttPUT(x) ((x) & 1024) |
| |
| /* |
| * order is important: |
| * -- the states listed before MSTATE are real automaton states and |
| * are used as the first index of cppm[][] and cppm_vch[]; |
| * -- the values listed after MSTATE are pseudo-states which |
| * read_token() turns into errors or into real actions; |
| * -- CMT() relies on S_COMMENT..S_COMMENT5 being contiguous. |
| */ |
| enum { |
| S_START, S_SPACE, S_BANG, S_STRING, S_STRING2, S_COLON, |
| S_SHARP, S_PCT, S_PCT2, S_PCT3, S_AMPER, S_CHAR, S_CHAR2, S_STAR, |
| S_PLUS, S_MINUS, S_DOT, S_DOT2, S_SLASH, S_NUMBER, S_NUMBER2, S_LT, |
| S_LT2, S_EQ, S_GT, S_GT2, S_CIRC, S_PIPE, S_TILDE, S_BACKSLASH, |
| S_COMMENT, S_COMMENT2, S_COMMENT3, S_COMMENT4, S_COMMENT5, |
| S_NAME, S_NAME_BS, S_LCHAR, |
| MSTATE, |
| S_ILL, S_DDOT, S_DDSHARP, S_BS, S_ROGUE_BS, S_BEHEAD, S_DECAY, |
| S_TRUNC, S_TRUNCC, S_OUCH |
| }; |
| |
| #define CMT(x) ((x) >= S_COMMENT && (x) <= S_COMMENT5) /* true for the comment states */ |
| |
| #define CMCR 2 /* maximum number of input characters (or classes) per rule */ |
| |
| /* |
| * This is the description of the automaton. It is not used "as is" |
| * but copied at execution time into a table. |
| * |
| * Each rule reads: in state `state', on any of the characters or |
| * character classes listed in `input', apply the action `new_state' |
| * (a state or a token type, possibly combined with STO/PUT/FRZ). |
| * init_cppm() applies the rules in order, so for a given state the |
| * leading ANY rule acts as a default which the following rules override. |
| * The list ends with an entry whose first input character is 0. |
| * |
| * To my utmost displeasure, there are a few hacks in read_token() |
| * (which uses the transformed automaton) about the special handling |
| * of slashes, sharps, and the letter L. |
| */ |
| static struct machine_state { |
| int state; /* state the rule applies to */ |
| unsigned char input[CMCR]; /* characters or classes triggering the rule; unused slots are 0 */ |
| int new_state; /* action: state or token type, with optional flags */ |
| } cppms[] = { |
| /* S_START is the generic beginning state */ |
| { S_START, { ANY }, S_ILL }, |
| #ifdef SEMPER_FIDELIS |
| { S_START, { SPC }, PUT(S_SPACE) }, |
| #else |
| { S_START, { SPC }, S_SPACE }, |
| #endif |
| { S_START, { '\n' }, STO(NEWLINE) }, |
| { S_START, { '!' }, S_BANG }, |
| { S_START, { '"' }, PUT(S_STRING) }, |
| { S_START, { '#' }, S_SHARP }, |
| { S_START, { '%' }, S_PCT }, |
| { S_START, { '&' }, S_AMPER }, |
| { S_START, { '\'' }, PUT(S_CHAR) }, |
| { S_START, { '(' }, STO(LPAR) }, |
| { S_START, { ')' }, STO(RPAR) }, |
| { S_START, { '*' }, S_STAR }, |
| { S_START, { '+' }, S_PLUS }, |
| { S_START, { ',' }, STO(COMMA) }, |
| { S_START, { '-' }, S_MINUS }, |
| { S_START, { '.' }, PUT(S_DOT) }, |
| #ifdef SEMPER_FIDELIS |
| { S_START, { '/' }, PUT(S_SLASH) }, |
| #else |
| { S_START, { '/' }, S_SLASH }, |
| #endif |
| { S_START, { NUM }, PUT(S_NUMBER) }, |
| #ifdef HAVE_ECPP |
| { S_START, { '@' }, S_COLON }, |
| { S_START, { ':' }, PUT(S_NAME) }, |
| #else |
| { S_START, { ':' }, S_COLON }, |
| #endif |
| { S_START, { ';' }, STO(SEMIC) }, |
| { S_START, { '<' }, S_LT }, |
| { S_START, { '=' }, S_EQ }, |
| { S_START, { '>' }, S_GT }, |
| { S_START, { '?' }, STO(QUEST) }, |
| { S_START, { ALP }, PUT(S_NAME) }, |
| { S_START, { 'L' }, PUT(S_LCHAR) }, |
| { S_START, { '[' }, STO(LBRK) }, |
| { S_START, { ']' }, STO(RBRK) }, |
| { S_START, { '^' }, S_CIRC }, |
| { S_START, { '{' }, STO(LBRA) }, |
| { S_START, { '|' }, S_PIPE }, |
| { S_START, { '}' }, STO(RBRA) }, |
| { S_START, { '~' }, S_TILDE }, |
| { S_START, { '\\' }, S_BACKSLASH }, |
| |
| /* after a space */ |
| { S_SPACE, { ANY }, FRZ(STO(NONE)) }, |
| #ifdef SEMPER_FIDELIS |
| { S_SPACE, { SPC }, PUT(S_SPACE) }, |
| #else |
| { S_SPACE, { SPC }, S_SPACE }, |
| #endif |
| |
| /* after a ! */ |
| { S_BANG, { ANY }, FRZ(STO(LNOT)) }, |
| { S_BANG, { '=' }, STO(NEQ) }, |
| |
| /* after a " */ |
| { S_STRING, { ANY }, PUT(S_STRING) }, |
| { S_STRING, { VCH }, FRZ(S_TRUNC) }, |
| { S_STRING, { '\n' }, FRZ(S_BEHEAD) }, |
| { S_STRING, { '\\' }, PUT(S_STRING2) }, |
| { S_STRING, { '"' }, PUT(STO(STRING)) }, |
| |
| { S_STRING2, { ANY }, PUT(S_STRING) }, |
| { S_STRING2, { VCH }, FRZ(S_TRUNC) }, |
| |
| /* after a # */ |
| { S_SHARP, { ANY }, FRZ(STO(SHARP)) }, |
| { S_SHARP, { '#' }, STO(DSHARP) }, |
| |
| /* after a : */ |
| { S_COLON, { ANY }, FRZ(STO(COLON)) }, |
| { S_COLON, { '>' }, STO(DIG_RBRK) }, |
| |
| /* after a % */ |
| { S_PCT, { ANY }, FRZ(STO(PCT)) }, |
| { S_PCT, { '=' }, STO(ASPCT) }, |
| { S_PCT, { '>' }, STO(DIG_RBRA) }, |
| { S_PCT, { ':' }, S_PCT2 }, |
| |
| /* after a %: */ |
| { S_PCT2, { ANY }, FRZ(STO(DIG_SHARP)) }, |
| { S_PCT2, { '%' }, S_PCT3 }, |
| |
| /* after a %:% */ |
| { S_PCT3, { ANY }, FRZ(S_DDSHARP) }, |
| { S_PCT3, { ':' }, STO(DIG_DSHARP) }, |
| |
| /* after a & */ |
| { S_AMPER, { ANY }, FRZ(STO(AND)) }, |
| { S_AMPER, { '=' }, STO(ASAND) }, |
| { S_AMPER, { '&' }, STO(LAND) }, |
| |
| /* after a ' */ |
| { S_CHAR, { ANY }, PUT(S_CHAR) }, |
| { S_CHAR, { VCH }, FRZ(S_TRUNC) }, |
| { S_CHAR, { '\'' }, PUT(STO(CHAR)) }, |
| { S_CHAR, { '\\' }, PUT(S_CHAR2) }, |
| |
| /* after a \ in a character constant |
| useful only for '\'' */ |
| { S_CHAR2, { ANY }, PUT(S_CHAR) }, |
| { S_CHAR2, { VCH }, FRZ(S_TRUNC) }, |
| |
| /* after a * */ |
| { S_STAR, { ANY }, FRZ(STO(STAR)) }, |
| { S_STAR, { '=' }, STO(ASSTAR) }, |
| |
| /* after a + */ |
| { S_PLUS, { ANY }, FRZ(STO(PLUS)) }, |
| { S_PLUS, { '+' }, STO(PPLUS) }, |
| { S_PLUS, { '=' }, STO(ASPLUS) }, |
| |
| /* after a - */ |
| { S_MINUS, { ANY }, FRZ(STO(MINUS)) }, |
| { S_MINUS, { '-' }, STO(MMINUS) }, |
| { S_MINUS, { '=' }, STO(ASMINUS) }, |
| { S_MINUS, { '>' }, STO(ARROW) }, |
| |
| /* after a . */ |
| { S_DOT, { ANY }, FRZ(STO(DOT)) }, |
| { S_DOT, { NUM }, PUT(S_NUMBER) }, |
| { S_DOT, { '.' }, S_DOT2 }, |
| |
| /* after .. */ |
| { S_DOT2, { ANY }, FRZ(S_DDOT) }, |
| { S_DOT2, { '.' }, STO(MDOTS) }, |
| |
| /* after a / */ |
| { S_SLASH, { ANY }, FRZ(STO(SLASH)) }, |
| { S_SLASH, { '=' }, STO(ASSLASH) }, |
| #ifdef SEMPER_FIDELIS |
| { S_SLASH, { '*' }, PUT(S_COMMENT) }, |
| { S_SLASH, { '/' }, PUT(S_COMMENT5) }, |
| #else |
| { S_SLASH, { '*' }, S_COMMENT }, |
| { S_SLASH, { '/' }, S_COMMENT5 }, |
| #endif |
| /* |
| * There is a little hack in read_token() to disable |
| * this last rule, if C++ (C99) comments are not enabled. |
| */ |
| |
| /* after a number */ |
| { S_NUMBER, { ANY }, FRZ(STO(NUMBER)) }, |
| { S_NUMBER, { ALP, NUM }, PUT(S_NUMBER) }, |
| { S_NUMBER, { '.' }, PUT(S_NUMBER) }, |
| { S_NUMBER, { 'E', 'e' }, PUT(S_NUMBER2) }, |
| { S_NUMBER, { 'P', 'p' }, PUT(S_NUMBER2) }, |
| |
| { S_NUMBER2, { ANY }, FRZ(STO(NUMBER)) }, |
| { S_NUMBER2, { ALP, NUM }, PUT(S_NUMBER) }, |
| { S_NUMBER2, { '+', '-' }, PUT(S_NUMBER) }, |
| |
| /* after a < */ |
| { S_LT, { ANY }, FRZ(STO(LT)) }, |
| { S_LT, { '=' }, STO(LEQ) }, |
| { S_LT, { '<' }, S_LT2 }, |
| { S_LT, { ':' }, STO(DIG_LBRK) }, |
| { S_LT, { '%' }, STO(DIG_LBRA) }, |
| |
| { S_LT2, { ANY }, FRZ(STO(LSH)) }, |
| { S_LT2, { '=' }, STO(ASLSH) }, |
| |
| /* after a > */ |
| { S_GT, { ANY }, FRZ(STO(GT)) }, |
| { S_GT, { '=' }, STO(GEQ) }, |
| { S_GT, { '>' }, S_GT2 }, |
| |
| { S_GT2, { ANY }, FRZ(STO(RSH)) }, |
| { S_GT2, { '=' }, STO(ASRSH) }, |
| |
| /* after a = */ |
| { S_EQ, { ANY }, FRZ(STO(ASGN)) }, |
| { S_EQ, { '=' }, STO(SAME) }, |
| #ifdef CAST_OP |
| { S_EQ, { '>' }, STO(CAST) }, |
| #endif |
| |
| /* after a \ */ |
| { S_BACKSLASH, { ANY }, FRZ(S_BS) }, |
| { S_BACKSLASH, { 'U', 'u' }, FRZ(S_NAME_BS) }, |
| |
| /* after a letter */ |
| { S_NAME, { ANY }, FRZ(STO(NAME)) }, |
| { S_NAME, { ALP, NUM }, PUT(S_NAME) }, |
| { S_NAME, { '\\' }, S_NAME_BS }, |
| #ifdef HAVE_ECPP |
| { S_NAME, { ':' }, PUT(S_NAME) }, |
| #endif |
| |
| /* after a \ in an identifier */ |
| { S_NAME_BS, { ANY }, FRZ(S_ROGUE_BS) }, |
| { S_NAME_BS, { 'u', 'U' }, PUT(S_NAME) }, |
| |
| /* after a L */ |
| { S_LCHAR, { ANY }, FRZ(S_NAME) }, |
| { S_LCHAR, { '"' }, PUT(S_STRING) }, |
| { S_LCHAR, { '\'' }, PUT(S_CHAR) }, |
| |
| /* after a ^ */ |
| { S_CIRC, { ANY }, FRZ(STO(CIRC)) }, |
| { S_CIRC, { '=' }, STO(ASCIRC) }, |
| |
| /* after a | */ |
| { S_PIPE, { ANY }, FRZ(STO(OR)) }, |
| { S_PIPE, { '=' }, STO(ASOR) }, |
| { S_PIPE, { '|' }, STO(LOR) }, |
| |
| /* after a ~ */ |
| { S_TILDE, { ANY }, FRZ(STO(NOT)) }, |
| { S_TILDE, { '=' }, STO(ASNOT) }, |
| |
| /* after a / and * */ |
| #ifdef SEMPER_FIDELIS |
| { S_COMMENT, { ANY }, PUT(S_COMMENT) }, |
| { S_COMMENT, { VCH }, FRZ(S_TRUNCC) }, |
| { S_COMMENT, { '*' }, PUT(S_COMMENT2) }, |
| |
| { S_COMMENT2, { ANY }, FRZ(S_COMMENT) }, |
| { S_COMMENT2, { VCH }, FRZ(S_TRUNCC) }, |
| { S_COMMENT2, { '*' }, PUT(S_COMMENT2) }, |
| { S_COMMENT2, { '/' }, STO(PUT(COMMENT)) }, |
| |
| { S_COMMENT5, { ANY }, PUT(S_COMMENT5) }, |
| { S_COMMENT5, { VCH }, FRZ(S_DECAY) }, |
| { S_COMMENT5, { '\n' }, FRZ(STO(COMMENT)) }, |
| #else |
| { S_COMMENT, { ANY }, S_COMMENT }, |
| { S_COMMENT, { VCH }, FRZ(S_TRUNCC) }, |
| { S_COMMENT, { '*' }, S_COMMENT2 }, |
| |
| { S_COMMENT2, { ANY }, FRZ(S_COMMENT) }, |
| { S_COMMENT2, { VCH }, FRZ(S_TRUNCC) }, |
| { S_COMMENT2, { '*' }, S_COMMENT2 }, |
| { S_COMMENT2, { '/' }, STO(COMMENT) }, |
| |
| { S_COMMENT5, { ANY }, S_COMMENT5 }, |
| { S_COMMENT5, { VCH }, FRZ(S_DECAY) }, |
| { S_COMMENT5, { '\n' }, FRZ(STO(COMMENT)) }, |
| #endif |
| |
| /* dummy end of machine description */ |
| { 0, { 0 }, 0 } |
| }; |
| |
| /* |
| * cppm is the table used to store the automaton: if we are in state s |
| * and we read character c, we apply the action cppm[s][c] (jumping to |
| * another state, or emitting a token). |
| * cppm_vch is the table for the special virtual character "end of input" |
| * |
| * Both tables are filled by init_cppm(); entries that no rule covers |
| * hold S_OUCH. |
| */ |
| static int cppm[MSTATE][MAX_CHAR_VAL]; |
| static int cppm_vch[MSTATE]; |
| |
| /* |
| * init_cppm() fills cppm[][] with the information stored in cppms[]. |
| * It must be called before beginning the lexing process. |
| */ |
| void init_cppm(void) |
| { |
| int i, j, k, c; |
| static unsigned char upper[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; |
| static unsigned char lower[] = "abcdefghijklmnopqrstuvwxyz"; |
| unsigned char *cp; |
| |
| for (i = 0; i < MSTATE; i ++) { |
| for (j = 0; j < MAX_CHAR_VAL; j ++) cppm[i][j] = S_OUCH; |
| cppm_vch[i] = S_OUCH; |
| } |
| for (i = 0; cppms[i].input[0]; i ++) for (k = 0; k < CMCR; k ++) { |
| int s = cppms[i].state; |
| int ns = cppms[i].new_state; |
| |
| switch (c = cppms[i].input[k]) { |
| case 0: |
| break; |
| case SPC: |
| /* see space_char() also */ |
| cppm[s][' '] = ns; |
| cppm[s]['\t'] = ns; |
| cppm[s]['\v'] = ns; |
| cppm[s]['\f'] = ns; |
| #ifdef UNBREAKABLE_SPACE |
| if (MAX_CHAR_VAL > UNBREAKABLE_SPACE) |
| cppm[s][UNBREAKABLE_SPACE] = ns; |
| #endif |
| break; |
| case ALP: |
| for (cp = upper; *cp; cp ++) cppm[s][(int)*cp] = ns; |
| for (cp = lower; *cp; cp ++) cppm[s][(int)*cp] = ns; |
| cppm[s]['_'] = ns; |
| break; |
| case NUM: |
| for (j = '0'; j <= '9'; j ++) cppm[s][j] = ns; |
| break; |
| case ANY: |
| for (j = 0; j < MAX_CHAR_VAL; j ++) cppm[s][j] = ns; |
| cppm_vch[s] = ns; |
| break; |
| case VCH: |
| cppm_vch[s] = ns; |
| break; |
| default: |
| cppm[s][c] = ns; |
| break; |
| } |
| } |
| } |
| |
/*
 * Return 1 if c is one of the whitespace characters of the SPC class
 * (space, horizontal tab, vertical tab, form feed and, when defined,
 * UNBREAKABLE_SPACE), 0 otherwise. Newline is not part of that class.
 */
int space_char(int c)
{
    int blank = (c == ' ' || c == '\t' || c == '\v' || c == '\f');

#ifdef UNBREAKABLE_SPACE
    if (c == UNBREAKABLE_SPACE) blank = 1;
#endif
    return blank ? 1 : 0;
}
| |
| #ifndef NO_UCPP_BUF |
| /* |
| * our output buffer is full, flush it |
| */ |
| void flush_output(struct lexer_state *ls) |
| { |
| size_t x = ls->sbuf, y = 0, z; |
| |
| if (ls->sbuf == 0) return; |
| do { |
| z = fwrite(ls->output_buf + y, 1, x, ls->output); |
| x -= z; |
| y += z; |
| } while (z && x > 0); |
| if (!y) { |
| error(ls->line, "could not flush output (disk full ?)"); |
| die(); |
| } |
| ls->sbuf = 0; |
| } |
| #endif |
| |
| /* |
| * Output one character; flush the buffer if needed. |
| * This function should not be called, except by put_char(). |
| */ |
| static inline void write_char(struct lexer_state *ls, unsigned char c) |
| { |
| #ifndef NO_UCPP_BUF |
| ls->output_buf[ls->sbuf ++] = c; |
| if (ls->sbuf == OUTPUT_BUF_MEMG) flush_output(ls); |
| #else |
| if (putc((int)c, ls->output) == EOF) { |
| error(ls->line, "output write error (disk full ?)"); |
| die(); |
| } |
| #endif |
| if (c == '\n') { |
| ls->oline ++; |
| } |
| } |
| |
| /* |
| * schedule a character for output |
| */ |
| void put_char(struct lexer_state *ls, unsigned char c) |
| { |
| if (ls->flags & KEEP_OUTPUT) write_char(ls, c); |
| } |
| |
| /* |
| * get next raw input character |
| */ |
| static inline int read_char(struct lexer_state *ls) |
| { |
| unsigned char c; |
| |
| if (!ls->input) { |
| return ((ls->pbuf ++) < ls->ebuf) ? |
| ls->input_string[ls->pbuf - 1] : -1; |
| } |
| while (1) { |
| #ifndef NO_UCPP_BUF |
| if (ls->pbuf == ls->ebuf) { |
| #ifdef UCPP_MMAP |
| if (ls->from_mmap) { |
| munmap((void *)ls->input_buf, ls->ebuf); |
| ls->from_mmap = 0; |
| ls->input_buf = ls->input_buf_sav; |
| } |
| #endif |
| ls->ebuf = fread(ls->input_buf, 1, |
| INPUT_BUF_MEMG, ls->input); |
| ls->pbuf = 0; |
| } |
| if (ls->ebuf == 0) return -1; |
| c = ls->input_buf[ls->pbuf ++]; |
| #else |
| int x = getc(ls->input); |
| |
| if (x == EOF) return -1; |
| c = x; |
| #endif |
| if (ls->flags & COPY_LINE) { |
| if (c == '\n') { |
| ls->copy_line[ls->cli] = 0; |
| ls->cli = 0; |
| } else if (ls->cli < (COPY_LINE_LENGTH - 1)) { |
| ls->copy_line[ls->cli ++] = c; |
| } |
| } |
| if (ls->macfile && c == '\n') { |
| ls->macfile = 0; |
| continue; |
| } |
| ls->macfile = 0; |
| if (c == '\r') { |
| /* |
| * We found a '\r'; we handle it as a newline |
| * and ignore the next newline. This should work |
| * with all combinations of Msdos, MacIntosh and |
| * Unix files on these three platforms. On other |
| * platforms, native file formats are always |
| * supported. |
| */ |
| ls->macfile = 1; |
| c = '\n'; |
| } |
| break; |
| } |
| /*vb*/ |
| #ifdef HAVE_MISRA |
| if(misracheck){ |
| if(!isspace((unsigned char)c)&& |
| (c<'A'||c>'Z')&& |
| (c<'a'||c>'z')&& |
| (c<'0'||c>'9')&& |
| c!='!'&&c!='\"'&&c!='#'&&c!='%'&&c!='&'&&c!='\''&&c!='('&&c!=')'&& |
| c!='*'&&c!='+'&&c!=','&&c!='-'&&c!='.'&&c!='/'&&c!=':'&& |
| c!=';'&&c!='<'&&c!='='&&c!='>'&&c!='?'&&c!='['&&c!=']'&& |
| c!='^'&&c!='_'&&c!='{'&&c!='|'&&c!='}'&&c!='~'&&c!='\\' |
| ) |
| misra_neu(5,4,1,ls->line,c,' '); |
| } |
| #endif |
| return c; |
| } |
| |
| /* |
| * next_fifo_char(), char_lka1() and char_lka2() give a two character |
| * look-ahead on the input stream; this is needed for trigraphs |
| */ |
| static inline int next_fifo_char(struct lexer_state *ls) |
| { |
| int c; |
| |
| if (ls->nlka != 0) { |
| c = ls->lka[0]; |
| ls->lka[0] = ls->lka[1]; |
| ls->nlka --; |
| } else c = read_char(ls); |
| return c; |
| } |
| |
| static inline int char_lka1(struct lexer_state *ls) |
| { |
| if (ls->nlka == 0) { |
| ls->lka[0] = read_char(ls); |
| ls->nlka ++; |
| } |
| return ls->lka[0]; |
| } |
| |
| static inline int char_lka2(struct lexer_state *ls) |
| { |
| #ifdef AUDIT |
| if (ls->nlka == 0) ouch("always in motion future is"); |
| #endif |
| if (ls->nlka == 1) { |
| ls->lka[1] = read_char(ls); |
| ls->nlka ++; |
| } |
| return ls->lka[1]; |
| } |
| |
/*
 * Trigraph table: the sequence "?" "?" followed by `old' stands for
 * the character `new'.
 */
struct trigraph {
    int old;
    int new;
};

static struct trigraph trig[9] = {
    { '=',  '#'  },
    { '/',  '\\' },
    { '\'', '^'  },
    { '(',  '['  },
    { ')',  ']'  },
    { '!',  '|'  },
    { '<',  '{'  },
    { '>',  '}'  },
    { '-',  '~'  }
};
| |
| /* |
| * Returns the next character, after treatment of trigraphs and terminating |
| * backslashes. Return value is -1 if there is no more input. |
| */ |
| static inline int next_char(struct lexer_state *ls) |
| { |
| int c; |
| |
| if (!ls->discard) return ls->last; |
| ls->discard = 0; |
| do { |
| c = next_fifo_char(ls); |
| #ifdef HAVE_ECPP |
| if(c==':'){ |
| if(char_lka1(ls)==':'){ |
| next_fifo_char(ls); |
| }else{ |
| c='@'; |
| } |
| } |
| #endif |
| /* check trigraphs */ |
| if (c == '?' && char_lka1(ls) == '?' |
| && (ls->flags & HANDLE_TRIGRAPHS)) { |
| int i, d; |
| |
| d = char_lka2(ls); |
| for (i = 0; i < 9; i ++) if (d == trig[i].old) { |
| if (ls->flags & WARN_TRIGRAPHS) { |
| ls->count_trigraphs ++; |
| } |
| if (ls->flags & WARN_TRIGRAPHS_MORE) { |
| warning(ls->line, "trigraph ?""?%c " |
| "encountered", d); |
| /*vb*/ |
| #ifdef HAVE_MISRA |
| misra_neu(7,4,2,-1,d); |
| #endif |
| } |
| next_fifo_char(ls); |
| next_fifo_char(ls); |
| c = trig[i].new; |
| break; |
| } |
| } |
| if (c == '\\' && char_lka1(ls) == '\n') { |
| ls->line ++; |
| next_fifo_char(ls); |
| } else { |
| ls->last = c; |
| return c; |
| } |
| } while (1); |
| } |
| |
/*
 * Public entry point giving access to next_char() from other files
 * (used by #error, #include directives).
 */
int grap_char(struct lexer_state *ls)
{
    int c = next_char(ls);

    return c;
}
| |
| /* |
| * Discard the current character, so that the next call to next_char() |
| * will step into the input stream. |
| */ |
| void discard_char(struct lexer_state *ls) |
| { |
| #ifdef AUDIT |
| if (ls->discard) ouch("overcollecting garbage"); |
| #endif |
| ls->discard = 1; |
| ls->utf8 = 0; |
| if (ls->last == '\n') ls->line ++; |
| } |
| |
/*
 * Convert an UTF-8 encoded character to a Universal Character Name
 * using \u (or \U when appropriate).
 *
 * utf8 holds the bytes of the encoded character packed in an integer,
 * first byte in the most significant position (e.g. 0xC3A9 for the
 * two bytes C3 A9). A value below 0x80 is a plain character.
 *
 * buf receives the nul-terminated result and must have room for 11
 * bytes. The result is the character itself for code points below
 * 128, "\uxxxx" for code points up to 0xffff, and "\U00xxxxxx" above;
 * hexadecimal digits are written in lower case.
 *
 * Returned value is the length of the string written, not counting
 * the terminating nul (1, 6 or 10).
 */
static int utf8_to_string(unsigned char buf[], unsigned long utf8)
{
    static const char hex[] = "0123456789abcdef";
    unsigned long val;
    int shift, n = 0;

    if (utf8 & 0x80UL) {
        unsigned long x1, x2, x3, x4;

        x1 = (utf8 >> 24) & 0x07UL;    /* payload of a 4-byte leader */
        x2 = (utf8 >> 16) & 0x7fUL;
        x3 = (utf8 >> 8) & 0x7fUL;
        x4 = utf8 & 0x3fUL;
        /*
         * x2 and x3 are either continuation bytes (6 payload bits,
         * bit 0x40 clear after masking) or the leader of a 3-byte,
         * respectively 2-byte, sequence.
         */
        if (x2 & 0x40UL) x2 &= 0x0fUL;
        if (x3 & 0x40UL) x3 &= 0x1fUL;
        /* each continuation byte contributes 6 bits: 0, 6, 12, 18 */
        val = x4 | (x3 << 6) | (x2 << 12) | (x1 << 18);
    } else {
        val = utf8;
    }

    if (val < 128) {
        buf[0] = val;
        buf[1] = 0;
        return 1;
    }
    buf[n ++] = '\\';
    if (val <= 0xffffUL) {
        buf[n ++] = 'u';
        shift = 12;
    } else {
        buf[n ++] = 'U';
        buf[n ++] = '0';
        buf[n ++] = '0';
        shift = 20;
    }
    /* every digit is masked, so that hex[] is never overrun */
    for (; shift >= 0; shift -= 4)
        buf[n ++] = hex[(size_t)((val >> shift) & 0xfUL)];
    buf[n] = 0;
    return n;
}
| |
| /* |
| * Scan the identifier and put it in canonical form: |
| * -- tranform \U0000xxxx into \uxxxx |
| * -- inside \u and \U, make letters low case |
| * -- report (some) incorrect use of UCN |
| */ |
| static void canonize_id(struct lexer_state *ls, char *id) |
| { |
| char *c, *d; |
| |
| for (c = d = id; *c;) { |
| if (*c == '\\') { |
| int i; |
| |
| if (!*(c + 1)) goto canon_error; |
| if (*(c + 1) == 'U') { |
| for (i = 0; i < 8 && *(c + i + 2); i ++); |
| if (i != 8) goto canon_error; |
| *(d ++) = '\\'; |
| c += 2; |
| for (i = 0; i < 4 && *(c + i) == '0'; i ++); |
| if (i == 4) { |
| *(d ++) = 'u'; |
| c += 4; |
| } else { |
| *(d ++) = 'U'; |
| i = 8; |
| } |
| for (; i > 0; i --) { |
| switch (*c) { |
| case 'A': *(d ++) = 'a'; break; |
| case 'B': *(d ++) = 'b'; break; |
| case 'C': *(d ++) = 'c'; break; |
| case 'D': *(d ++) = 'd'; break; |
| case 'E': *(d ++) = 'e'; break; |
| case 'F': *(d ++) = 'f'; break; |
| default: *(d ++) = *c; break; |
| } |
| c ++; |
| } |
| } else if (*(c + 1) == 'u') { |
| for (i = 0; i < 4 && *(c + i + 2); i ++); |
| if (i != 4) goto canon_error; |
| *(d ++) = '\\'; |
| *(d ++) = 'u'; |
| c += 2; |
| for (; i > 0; i --) { |
| switch (*c) { |
| case 'A': *(d ++) = 'a'; break; |
| case 'B': *(d ++) = 'b'; break; |
| case 'C': *(d ++) = 'c'; break; |
| case 'D': *(d ++) = 'd'; break; |
| case 'E': *(d ++) = 'e'; break; |
| case 'F': *(d ++) = 'f'; break; |
| default: *(d ++) = *c; break; |
| } |
| c ++; |
| } |
| } else goto canon_error; |
| continue; |
| } |
| *(d ++) = *(c ++); |
| } |
| *d = 0; |
| return; |
| |
| canon_error: |
| for (; *c; *(d ++) = *(c ++)); |
| if (ls->flags & WARN_STANDARD) { |
| warning(ls->line, "malformed identifier with UCN: '%s'", id); |
| } |
| *d = 0; |
| } |
| |
| /* |
| * Run the automaton, in order to get the next token. |
| * This function should not be called, except by next_token() |
| * |
| * Starting from S_START, each character delivered by next_char() is |
| * looked up in cppm[][] (or in cppm_vch[] at end of input) until an |
| * action carrying the STO flag completes a token; the token type and, |
| * for S_TOKEN() types, the nul-terminated token text are stored in |
| * ls->ctok. A token recorded in ls->pending_token by a previous call |
| * is returned first, without reading any input. |
| * When LEXER is not set and ls->condcomp is non-zero, most consumed |
| * characters are also sent to put_char(). |
| * |
| * return value: 1 on error, 2 on end-of-file, 0 otherwise. |
| */ |
| static inline int read_token(struct lexer_state *ls) |
| { |
| int cstat = S_START, nstat; /* current state, and action for the current character */ |
| size_t ltok = 0; /* length of the token text stored so far */ |
| int c, outc = 0, ucn_in_id = 0; /* outc: withheld output: 0 none, > 0 one character, -1 "%:", -2 "%:%" */ |
| int shift_state; /* UTF-8 continuation bytes still expected; set only with UTF8_SOURCE */ |
| unsigned long utf8; /* bytes of the current UTF-8 character; set only with UTF8_SOURCE */ |
| long l = ls->line; /* line on which the token starts */ |
| |
| ls->ctok->line = l; |
| if (ls->pending_token) { |
| if ((ls->ctok->type = ls->pending_token) == BUNCH) { |
| ls->ctok->name[0] = '\\'; |
| ls->ctok->name[1] = 0; |
| } |
| ls->pending_token = 0; |
| return 0; |
| } |
| if (ls->flags & UTF8_SOURCE) { |
| utf8 = ls->utf8; |
| shift_state = 0; |
| } |
| if (!(ls->flags & LEXER) && (ls->flags & KEEP_OUTPUT)) |
| for (; ls->line > ls->oline;) put_char(ls, '\n'); /* let the output line count catch up */ |
| do { |
| c = next_char(ls); |
| if (c < 0) { /* end of input */ |
| if ((ls->flags & UTF8_SOURCE) && shift_state) { |
| if (ls->flags & WARN_STANDARD) |
| warning(ls->line, "truncated UTF-8 " |
| "character"); |
| shift_state = 0; |
| utf8 = 0; |
| } |
| if (cstat == S_START) return 2; /* no token in progress */ |
| nstat = cppm_vch[cstat]; |
| } else { |
| if (ls->flags & UTF8_SOURCE) { |
| if (shift_state) { /* inside a multi-byte character */ |
| if ((c & 0xc0) != 0x80) { /* not a continuation byte */ |
| if (ls->flags & WARN_STANDARD) |
| warning(ls->line, |
| "truncated " |
| "UTF-8 " |
| "character"); |
| shift_state = 0; |
| utf8 = 0; |
| c = '_'; |
| } else { |
| utf8 = (utf8 << 8) | c; |
| if (-- shift_state) { |
| ls->discard = 1; |
| continue; |
| } |
| c = '_'; /* complete character: the automaton sees it as '_' */ |
| } |
| } else if ((c & 0xc0) == 0xc0) { /* leading byte of a multi-byte character */ |
| if ((c & 0x30) == 0x30) { |
| shift_state = 3; |
| } else if (c & 0x20) { |
| shift_state = 2; |
| } else { |
| shift_state = 1; |
| } |
| utf8 = c; |
| ls->discard = 1; |
| continue; |
| } else utf8 = 0; |
| } |
| nstat = cppm[cstat][c < MAX_CHAR_VAL ? c : 0]; /* characters outside the table use entry 0 */ |
| } |
| #ifdef AUDIT |
| if (nstat == S_OUCH) { |
| ouch("bad move..."); |
| } |
| #endif |
| /* |
| * disable C++-like comments |
| */ |
| if (nstat == S_COMMENT5 && !(ls->flags & CPLUSPLUS_COMMENTS)) |
| nstat = FRZ(STO(SLASH)); |
| |
| if (noMOD(nstat) >= MSTATE && !ttSTO(nstat)) /* pseudo-state: error, or rewritten into a real action */ |
| switch (noMOD(nstat)) { |
| case S_ILL: |
| if (ls->flags & CCHARSET) { |
| error(ls->line, "illegal character '%c'", c); |
| return 1; |
| } |
| nstat = PUT(STO(BUNCH)); |
| break; |
| case S_BS: |
| ls->ctok->name[0] = '\\'; |
| ltok ++; |
| nstat = FRZ(STO(BUNCH)); |
| if (!(ls->flags & LEXER)) put_char(ls, '\\'); |
| break; |
| case S_ROGUE_BS: |
| ls->pending_token = BUNCH; |
| nstat = FRZ(STO(NAME)); |
| break; |
| case S_DDOT: |
| ls->pending_token = DOT; |
| nstat = FRZ(STO(DOT)); |
| break; |
| case S_DDSHARP: |
| ls->pending_token = PCT; |
| nstat = FRZ(STO(DIG_SHARP)); |
| break; |
| case S_BEHEAD: |
| error(l, "unfinished string at end of line"); |
| return 1; |
| case S_DECAY: |
| warning(l, "unterminated // comment"); |
| nstat = FRZ(STO(COMMENT)); |
| break; |
| case S_TRUNC: |
| error(l, "truncated token"); |
| return 1; |
| case S_TRUNCC: |
| error(l, "truncated comment"); |
| return 1; |
| #ifdef AUDIT |
| case S_OUCH: |
| ouch("machine went out of control"); |
| break; |
| #endif |
| } |
| if (!ttFRZ(nstat)) { /* the character is consumed */ |
| discard_char(ls); |
| if (!(ls->flags & LEXER) && ls->condcomp) { |
| int z = ttSTO(nstat) ? S_ILL : noMOD(nstat); /* next state, or S_ILL when a token was completed */ |
| |
| if (cstat == S_NAME || z == S_NAME |
| || ((CMT(cstat) || CMT(z)) |
| && (ls->flags & DISCARD_COMMENTS))) { |
| outc = 0; |
| } else if (z == S_LCHAR || z == S_SLASH |
| || (z == S_SHARP && ls->ltwnl) |
| || (z == S_PCT && ls->ltwnl) |
| || (z == S_BACKSLASH)) { |
| outc = c; |
| } else if (z == S_PCT2 && ls->ltwnl) { |
| outc = -1; |
| } else if (z == S_PCT3 && ls->ltwnl) { |
| /* we have %:% but this still might |
| not be a %:%: */ |
| outc = -2; |
| } else { |
| if (outc < 0) { |
| put_char(ls, '%'); |
| put_char(ls, ':'); |
| if (outc == -2) |
| put_char(ls, '%'); |
| outc = 0; |
| } else if (outc) { |
| put_char(ls, outc); |
| outc = 0; |
| } |
| put_char(ls, c); |
| } |
| } |
| } else if (outc == '/' && !(ls->flags & LEXER) |
| && ls->condcomp) { |
| /* this is a hack: we need to dump a pending slash */ |
| put_char(ls, outc); |
| outc = 0; |
| } |
| if (ttPUT(nstat)) { /* the character belongs to the token text */ |
| if (cstat == S_NAME_BS) { |
| ucn_in_id = 1; |
| wan(ls->ctok->name, ltok, '\\', ls->tknl); |
| } |
| if ((ls->flags & UTF8_SOURCE) && utf8) { |
| unsigned char buf[11]; |
| int i, j; |
| |
| for (i = 0, j = utf8_to_string(buf, utf8); |
| i < j; i ++) |
| wan(ls->ctok->name, ltok, buf[i], |
| ls->tknl); |
| /* if (j > 1) ucn_in_id = 1; */ |
| } else wan(ls->ctok->name, ltok, |
| (unsigned char)c, ls->tknl); |
| } |
| if (ttSTO(nstat)) { /* the token is complete */ |
| if (S_TOKEN(noMOD(nstat))) { |
| wan(ls->ctok->name, ltok, |
| (unsigned char)0, ls->tknl); |
| } |
| ls->ctok->type = noMOD(nstat); |
| break; |
| } |
| cstat = noMOD(nstat); |
| } while (1); |
| if (!(ls->flags & LEXER) && (ls->flags & DISCARD_COMMENTS) |
| && ls->ctok->type == COMMENT) put_char(ls, ' '); /* a discarded comment is output as one space */ |
| if (ucn_in_id && ls->ctok->type == NAME) |
| canonize_id(ls, ls->ctok->name); |
| return 0; |
| } |
| |
| /* |
| * fills ls->ctok with the next token |
| */ |
| int next_token(struct lexer_state *ls) |
| { |
| if (ls->flags & READ_AGAIN) { |
| ls->flags &= ~READ_AGAIN; |
| if (!(ls->flags & LEXER)) { |
| char *c = S_TOKEN(ls->ctok->type) ? |
| ls->ctok->name : token_name(ls->ctok); |
| if (ls->ctok->type == OPT_NONE) { |
| ls->ctok->type = NONE; |
| #ifdef SEMPER_FIDELIS |
| ls->ctok->name[0] = ' '; |
| ls->ctok->name[1] = 0; |
| #endif |
| put_char(ls, ' '); |
| } else if (ls->ctok->type != NAME && |
| !(ls->ltwnl && (ls->ctok->type == SHARP |
| || ls->ctok->type == DIG_SHARP))) |
| for (; *c; c ++) put_char(ls, *c); |
| } |
| return 0; |
| } |
| return read_token(ls); |
| } |