ucpp/lexer.c - vbcc - Gitiles

 /*
  * (c) Thomas Pornin 1999, 2000
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. The name of the authors may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
  * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  */

 /*vb*/
 /*
  * Hooks into the MISRA checker. They are only declared here; their
  * definitions are not part of this file. Everything is compiled in only
  * when HAVE_MISRA is defined.
  */
 #ifdef HAVE_MISRA
 extern int misracheck;	/* tested in read_char() before a character is reported */
 void misra(int,...);
 void misra_neu(int, int, int, int, ...);
 #endif

 #include "tune.h"
 #ifdef UCPP_MMAP
 #ifndef _POSIX_SOURCE
 #define _POSIX_SOURCE	1
 #endif
 #endif
 #include <stdio.h>
 #include <string.h>
 #include <stddef.h>
 #include <limits.h>
 #include <ctype.h>
 #include "ucppi.h"
 #include "mem.h"
 #ifdef UCPP_MMAP
 #include <unistd.h>
 #include <sys/types.h>
 #include <sys/mman.h>
 #endif

 /*
  * Character classes for description of the automaton.
  * The characters used for representing classes should not appear
  * explicitly in an automaton rule: init_cppm() interprets each of them
  * as the whole class, never as the literal character.
  */
 #define SPC	' '	/* whitespace characters */
 #define ALP	'Z'	/* A-Z, a-z, _ */
 #define NUM	'9'	/* 0-9 */
 #define ANY	'Y'	/* any character */
 #define VCH	'F'	/* void character (for end of input) */

 /*
  * flags and macros to test those flags
  * STO: the currently read string is a complete token
  * PUT: the currently read character must be added to the string
  * FRZ: the currently read character must be kept and read again
  *
  * A table entry keeps a state number or a token type in its low eight
  * bits; the flags are the bits 256, 512 and 1024 above them. noMOD()
  * strips the flags, the ttXXX() macros test one flag each.
  */
 #define MOD_MK		255	/* same value as the mask in noMOD() */
 #define noMOD(x)	((x) & 255)
 #define STO(x)		((x) | 256)
 #define ttSTO(x)	((x) & 256)
 #define FRZ(x)		((x) | 512)
 #define ttFRZ(x)	((x) & 512)
 #define PUT(x)		((x) | 1024)
 #define ttPUT(x)	((x) & 1024)

 /*
  * order is important:
  *  -- the states before MSTATE are the real automaton states; they
  *     index the first dimension of cppm[][] and cppm_vch[]
  *  -- S_COMMENT to S_COMMENT5 must stay contiguous, CMT() tests a range
  *  -- the values after MSTATE are pseudo-states; read_token() handles
  *     them in a switch instead of looking them up in the tables
  */
 enum {
 	S_START, S_SPACE, S_BANG, S_STRING, S_STRING2, S_COLON,
 	S_SHARP, S_PCT, S_PCT2, S_PCT3, S_AMPER, S_CHAR, S_CHAR2, S_STAR,
 	S_PLUS, S_MINUS, S_DOT, S_DOT2, S_SLASH, S_NUMBER, S_NUMBER2, S_LT,
 	S_LT2, S_EQ, S_GT, S_GT2, S_CIRC, S_PIPE, S_TILDE, S_BACKSLASH,
 	S_COMMENT, S_COMMENT2, S_COMMENT3, S_COMMENT4, S_COMMENT5,
 	S_NAME, S_NAME_BS, S_LCHAR,
 	MSTATE,
 	S_ILL, S_DDOT, S_DDSHARP, S_BS, S_ROGUE_BS, S_BEHEAD, S_DECAY,
 	S_TRUNC, S_TRUNCC, S_OUCH
 };

 /* true if x is one of the comment states S_COMMENT ... S_COMMENT5 */
 #define CMT(x)		((x) >= S_COMMENT && (x) <= S_COMMENT5)

 /* maximum number of input characters (or classes) one rule may list */
 #define CMCR	2

 /*
  * This is the description of the automaton. It is not used "as is"
  * but copied at execution time into a table.
  *
  * To my utmost displeasure, there are a few hacks in read_token()
  * (which uses the transformed automaton) about the special handling
  * of slashes, sharps, and the letter L.
  */
 /*
  * One rule of the automaton: in `state', on any of the characters (or
  * character classes) listed in `input', perform `new_state'.
  *
  * init_cppm() applies the rules in the order they are written and simply
  * overwrites table entries, so for a given state the catch-all ANY rule
  * comes first and the more specific rules that follow override it.
  * The list ends with a rule whose input[0] is 0.
  */
 static struct machine_state {
 	int state;			/* state the rule applies to */
 	unsigned char input[CMCR];	/* characters or classes; unused slots are 0 */
 	int new_state;			/* next state or token type, with STO/FRZ/PUT flags */
 } cppms[] = {
 	/* S_START is the generic beginning state */
 	{ S_START,	{ ANY },	S_ILL			},
 #ifdef SEMPER_FIDELIS
 	{ S_START,	{ SPC },	PUT(S_SPACE)		},
 #else
 	{ S_START,	{ SPC },	S_SPACE			},
 #endif
 	{ S_START,	{ '\n' },	STO(NEWLINE)		},
 	{ S_START,	{ '!' },	S_BANG			},
 	{ S_START,	{ '"' },	PUT(S_STRING)		},
 	{ S_START,	{ '#' },	S_SHARP			},
 	{ S_START,	{ '%' },	S_PCT			},
 	{ S_START,	{ '&' },	S_AMPER			},
 	{ S_START,	{ '\'' },	PUT(S_CHAR)		},
 	{ S_START,	{ '(' },	STO(LPAR)		},
 	{ S_START,	{ ')' },	STO(RPAR)		},
 	{ S_START,	{ '*' },	S_STAR			},
 	{ S_START,	{ '+' },	S_PLUS			},
 	{ S_START,	{ ',' },	STO(COMMA)		},
 	{ S_START,	{ '-' },	S_MINUS			},
 	{ S_START,	{ '.' },	PUT(S_DOT)		},
 #ifdef SEMPER_FIDELIS
 	{ S_START,	{ '/' },	PUT(S_SLASH)		},
 #else
 	{ S_START,	{ '/' },	S_SLASH			},
 #endif
 	{ S_START,	{ NUM },	PUT(S_NUMBER)		},
 #ifdef HAVE_ECPP
 	{ S_START,	{ '@' },	S_COLON			},
 	{ S_START,	{ ':' },	PUT(S_NAME)		},
 #else
 	{ S_START,	{ ':' },	S_COLON			},
 #endif
 	{ S_START,	{ ';' },	STO(SEMIC)		},
 	{ S_START,	{ '<' },	S_LT			},
 	{ S_START,	{ '=' },	S_EQ			},
 	{ S_START,	{ '>' },	S_GT			},
 	{ S_START,	{ '?' },	STO(QUEST)		},
 	{ S_START,	{ ALP },	PUT(S_NAME)		},
 	/* must follow the ALP rule, which also covers 'L' */
 	{ S_START,	{ 'L' },	PUT(S_LCHAR)		},
 	{ S_START,	{ '[' },	STO(LBRK)		},
 	{ S_START,	{ ']' },	STO(RBRK)		},
 	{ S_START,	{ '^' },	S_CIRC			},
 	{ S_START,	{ '{' },	STO(LBRA)		},
 	{ S_START,	{ '|' },	S_PIPE			},
 	{ S_START,	{ '}' },	STO(RBRA)		},
 	{ S_START,	{ '~' },	S_TILDE			},
 	{ S_START,	{ '\\' },	S_BACKSLASH		},

 	/* after a space */
 	{ S_SPACE,	{ ANY },	FRZ(STO(NONE))		},
 #ifdef SEMPER_FIDELIS
 	{ S_SPACE,	{ SPC },	PUT(S_SPACE)		},
 #else
 	{ S_SPACE,	{ SPC },	S_SPACE			},
 #endif

 	/* after a ! */
 	{ S_BANG,	{ ANY },	FRZ(STO(LNOT))		},
 	{ S_BANG,	{ '=' },	STO(NEQ)		},

 	/* after a " */
 	{ S_STRING,	{ ANY },	PUT(S_STRING)		},
 	{ S_STRING,	{ VCH },	FRZ(S_TRUNC)		},
 	{ S_STRING,	{ '\n' },	FRZ(S_BEHEAD)		},
 	{ S_STRING,	{ '\\' },	PUT(S_STRING2)		},
 	{ S_STRING,	{ '"' },	PUT(STO(STRING))	},

 	/* after a \ in a string: the next character is taken verbatim */
 	{ S_STRING2,	{ ANY },	PUT(S_STRING)		},
 	{ S_STRING2,	{ VCH },	FRZ(S_TRUNC)		},

 	/* after a # */
 	{ S_SHARP,	{ ANY },	FRZ(STO(SHARP))		},
 	{ S_SHARP,	{ '#' },	STO(DSHARP)		},

 	/* after a : */
 	{ S_COLON,	{ ANY },	FRZ(STO(COLON))		},
 	{ S_COLON,	{ '>' },	STO(DIG_RBRK)		},

 	/* after a % */
 	{ S_PCT,	{ ANY },	FRZ(STO(PCT))		},
 	{ S_PCT,	{ '=' },	STO(ASPCT)		},
 	{ S_PCT,	{ '>' },	STO(DIG_RBRA)		},
 	{ S_PCT,	{ ':' },	S_PCT2			},

 	/* after a %: */
 	{ S_PCT2,	{ ANY },	FRZ(STO(DIG_SHARP))	},
 	{ S_PCT2,	{ '%' },	S_PCT3			},

 	/* after a %:% */
 	{ S_PCT3,	{ ANY },	FRZ(S_DDSHARP)		},
 	{ S_PCT3,	{ ':' },	STO(DIG_DSHARP)		},

 	/* after a & */
 	{ S_AMPER,	{ ANY },	FRZ(STO(AND))		},
 	{ S_AMPER,	{ '=' },	STO(ASAND)		},
 	{ S_AMPER,	{ '&' },	STO(LAND)		},

 	/* after a ' */
 	{ S_CHAR,	{ ANY },	PUT(S_CHAR)		},
 	{ S_CHAR,	{ VCH },	FRZ(S_TRUNC)		},
 	{ S_CHAR,	{ '\'' },	PUT(STO(CHAR))		},
 	{ S_CHAR,	{ '\\' },	PUT(S_CHAR2)		},

 	/* after a \ in a character constant
 	   useful only for '\'' */
 	{ S_CHAR2,	{ ANY },	PUT(S_CHAR)		},
 	{ S_CHAR2,	{ VCH },	FRZ(S_TRUNC)		},

 	/* after a * */
 	{ S_STAR,	{ ANY },	FRZ(STO(STAR))		},
 	{ S_STAR,	{ '=' },	STO(ASSTAR)		},

 	/* after a + */
 	{ S_PLUS,	{ ANY },	FRZ(STO(PLUS))		},
 	{ S_PLUS,	{ '+' },	STO(PPLUS)		},
 	{ S_PLUS,	{ '=' },	STO(ASPLUS)		},

 	/* after a - */
 	{ S_MINUS,	{ ANY },	FRZ(STO(MINUS))		},
 	{ S_MINUS,	{ '-' },	STO(MMINUS)		},
 	{ S_MINUS,	{ '=' },	STO(ASMINUS)		},
 	{ S_MINUS,	{ '>' },	STO(ARROW)		},

 	/* after a . */
 	{ S_DOT,	{ ANY },	FRZ(STO(DOT))		},
 	{ S_DOT,	{ NUM },	PUT(S_NUMBER)		},
 	{ S_DOT,	{ '.' },	S_DOT2			},

 	/* after .. */
 	{ S_DOT2,	{ ANY },	FRZ(S_DDOT)		},
 	{ S_DOT2,	{ '.' },	STO(MDOTS)		},

 	/* after a / */
 	{ S_SLASH,	{ ANY },	FRZ(STO(SLASH))		},
 	{ S_SLASH,	{ '=' },	STO(ASSLASH)		},
 #ifdef SEMPER_FIDELIS
 	{ S_SLASH,	{ '*' },	PUT(S_COMMENT)		},
 	{ S_SLASH,	{ '/' },	PUT(S_COMMENT5)		},
 #else
 	{ S_SLASH,	{ '*' },	S_COMMENT		},
 	{ S_SLASH,	{ '/' },	S_COMMENT5		},
 #endif
 	/*
 	 * There is a little hack in read_token() to disable
 	 * this last rule, if C++ (C99) comments are not enabled.
 	 */

 	/* after a number */
 	{ S_NUMBER,	{ ANY },	FRZ(STO(NUMBER))	},
 	{ S_NUMBER,	{ ALP, NUM },	PUT(S_NUMBER)		},
 	{ S_NUMBER,	{ '.' },	PUT(S_NUMBER)		},
 	{ S_NUMBER,	{ 'E', 'e' },	PUT(S_NUMBER2)		},
 	{ S_NUMBER,	{ 'P', 'p' },	PUT(S_NUMBER2)		},

 	/*
 	 * after an exponent letter in a number: a sign is accepted here
 	 * NOTE(review): a second exponent letter read in this state goes
 	 * back to S_NUMBER through the ALP rule, so in "1EE+" the sign ends
 	 * the number -- confirm this is intended.
 	 */
 	{ S_NUMBER2,	{ ANY },	FRZ(STO(NUMBER))	},
 	{ S_NUMBER2,	{ ALP, NUM },	PUT(S_NUMBER)		},
 	{ S_NUMBER2,	{ '+', '-' },	PUT(S_NUMBER)		},

 	/* after a < */
 	{ S_LT,		{ ANY },	FRZ(STO(LT))		},
 	{ S_LT,		{ '=' },	STO(LEQ)		},
 	{ S_LT,		{ '<' },	S_LT2			},
 	{ S_LT,		{ ':' },	STO(DIG_LBRK)		},
 	{ S_LT,		{ '%' },	STO(DIG_LBRA)		},

 	/* after << */
 	{ S_LT2,	{ ANY },	FRZ(STO(LSH))		},
 	{ S_LT2,	{ '=' },	STO(ASLSH)		},

 	/* after a > */
 	{ S_GT,		{ ANY },	FRZ(STO(GT))		},
 	{ S_GT,		{ '=' },	STO(GEQ)		},
 	{ S_GT,		{ '>' },	S_GT2			},

 	/* after >> */
 	{ S_GT2,	{ ANY },	FRZ(STO(RSH))		},
 	{ S_GT2,	{ '=' },	STO(ASRSH)		},

 	/* after a = */
 	{ S_EQ,		{ ANY },	FRZ(STO(ASGN))		},
 	{ S_EQ,		{ '=' },	STO(SAME)		},
 #ifdef CAST_OP
 	{ S_EQ,		{ '>' },	STO(CAST)		},
 #endif

 	/* after a \ */
 	{ S_BACKSLASH,	{ ANY },	FRZ(S_BS)		},
 	{ S_BACKSLASH,	{ 'U', 'u' },	FRZ(S_NAME_BS)		},

 	/* after a letter */
 	{ S_NAME,	{ ANY },	FRZ(STO(NAME))		},
 	{ S_NAME,	{ ALP, NUM },	PUT(S_NAME)		},
 	{ S_NAME,	{ '\\' },	S_NAME_BS		},
 #ifdef HAVE_ECPP
 	{ S_NAME,	{ ':' },	PUT(S_NAME)		},
 #endif

 	/* after a \ in an identifier */
 	{ S_NAME_BS,	{ ANY },	FRZ(S_ROGUE_BS)		},
 	{ S_NAME_BS,	{ 'u', 'U' },	PUT(S_NAME)		},

 	/* after a L */
 	{ S_LCHAR,	{ ANY },	FRZ(S_NAME)		},
 	{ S_LCHAR,	{ '"' },	PUT(S_STRING)		},
 	{ S_LCHAR,	{ '\'' },	PUT(S_CHAR)		},

 	/* after a ^ */
 	{ S_CIRC,	{ ANY },	FRZ(STO(CIRC))		},
 	{ S_CIRC,	{ '=' },	STO(ASCIRC)		},

 	/* after a | */
 	{ S_PIPE,	{ ANY },	FRZ(STO(OR))		},
 	{ S_PIPE,	{ '=' },	STO(ASOR)		},
 	{ S_PIPE,	{ '|' },	STO(LOR)		},

 	/* after a ~ */
 	{ S_TILDE,	{ ANY },	FRZ(STO(NOT))		},
 	{ S_TILDE,	{ '=' },	STO(ASNOT)		},

 	/* after a / and * */
 #ifdef SEMPER_FIDELIS
 	{ S_COMMENT,	{ ANY },	PUT(S_COMMENT)		},
 	{ S_COMMENT,	{ VCH },	FRZ(S_TRUNCC)		},
 	{ S_COMMENT,	{ '*' },	PUT(S_COMMENT2)		},

 	{ S_COMMENT2,	{ ANY },	FRZ(S_COMMENT)		},
 	{ S_COMMENT2,	{ VCH },	FRZ(S_TRUNCC)		},
 	{ S_COMMENT2,	{ '*' },	PUT(S_COMMENT2)		},
 	{ S_COMMENT2,	{ '/' },	STO(PUT(COMMENT))	},

 	{ S_COMMENT5,	{ ANY },	PUT(S_COMMENT5)		},
 	{ S_COMMENT5,	{ VCH },	FRZ(S_DECAY)		},
 	{ S_COMMENT5,	{ '\n' },	FRZ(STO(COMMENT))	},
 #else
 	{ S_COMMENT,	{ ANY },	S_COMMENT		},
 	{ S_COMMENT,	{ VCH },	FRZ(S_TRUNCC)		},
 	{ S_COMMENT,	{ '*' },	S_COMMENT2		},

 	{ S_COMMENT2,	{ ANY },	FRZ(S_COMMENT)		},
 	{ S_COMMENT2,	{ VCH },	FRZ(S_TRUNCC)		},
 	{ S_COMMENT2,	{ '*' },	S_COMMENT2		},
 	{ S_COMMENT2,	{ '/' },	STO(COMMENT)		},

 	{ S_COMMENT5,	{ ANY },	S_COMMENT5		},
 	{ S_COMMENT5,	{ VCH },	FRZ(S_DECAY)		},
 	{ S_COMMENT5,	{ '\n' },	FRZ(STO(COMMENT))	},
 #endif

 	/* dummy end of machine description */
 	{ 0,		{ 0 },		0			}
 };

 /*
  * cppm is the table used to store the automaton: if we are in state s
  * and we read character c, we apply the action cppm[s][c] (jumping to
  * another state, or emitting a token).
  * cppm_vch is the table for the special virtual character "end of input"
  *
  * Both tables are filled by init_cppm(); entries that no rule covers
  * keep the value S_OUCH.
  */
 static int cppm[MSTATE][MAX_CHAR_VAL];
 static int cppm_vch[MSTATE];

 /*
  * init_cppm() fills cppm[][] with the information stored in cppms[].
  * It must be called before beginning the lexing process.
  */
 void init_cppm(void)
 {
 	int i, j, k, c;
 	static unsigned char upper[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
 	static unsigned char lower[] = "abcdefghijklmnopqrstuvwxyz";
 	unsigned char *cp;

 	for (i = 0; i < MSTATE; i ++) {
 		for (j = 0; j < MAX_CHAR_VAL; j ++) cppm[i][j] = S_OUCH;
 		cppm_vch[i] = S_OUCH;
 	}
 	for (i = 0; cppms[i].input[0]; i ++) for (k = 0; k < CMCR; k ++) {
 		int s = cppms[i].state;
 		int ns = cppms[i].new_state;

 		switch (c = cppms[i].input[k]) {
 		case 0:
 			break;
 		case SPC:
 			/* see space_char() also */
 			cppm[s][' '] = ns;
 			cppm[s]['\t'] = ns;
 			cppm[s]['\v'] = ns;
 			cppm[s]['\f'] = ns;
 #ifdef UNBREAKABLE_SPACE
 			if (MAX_CHAR_VAL > UNBREAKABLE_SPACE)
 				cppm[s][UNBREAKABLE_SPACE] = ns;
 #endif
 			break;
 		case ALP:
 			for (cp = upper; *cp; cp ++) cppm[s][(int)*cp] = ns;
 			for (cp = lower; *cp; cp ++) cppm[s][(int)*cp] = ns;
 			cppm[s]['_'] = ns;
 			break;
 		case NUM:
 			for (j = '0'; j <= '9'; j ++) cppm[s][j] = ns;
 			break;
 		case ANY:
 			for (j = 0; j < MAX_CHAR_VAL; j ++) cppm[s][j] = ns;
 			cppm_vch[s] = ns;
 			break;
 		case VCH:
 			cppm_vch[s] = ns;
 			break;
 		default:
 			cppm[s][c] = ns;
 			break;
 		}
 	}
 }

 /*
  * Return 1 if c is one of the characters the lexer treats as
  * whitespace (space, tab, vertical tab, form feed and, when it is
  * defined, UNBREAKABLE_SPACE), 0 otherwise. Newline is not part of
  * the set. The SPC case of init_cppm() lists the same characters.
  */
 int space_char(int c)
 {
 	if (c == ' ') return 1;
 	if (c == '\t') return 1;
 	if (c == '\v') return 1;
 	if (c == '\f') return 1;
 #ifdef UNBREAKABLE_SPACE
 	if (c == UNBREAKABLE_SPACE) return 1;
 #endif
 	return 0;
 }

 #ifndef NO_UCPP_BUF
 /*
  * Write the pending bytes of the output buffer to ls->output and mark
  * the buffer as empty. Nothing is done if the buffer holds no byte.
  *
  * fwrite() may write less than requested, so it is called again until
  * everything is out or it no longer makes progress. If any byte is left
  * unwritten, an error is reported and die() is called: emptying the
  * buffer anyway would silently drop that part of the output.
  */
 void flush_output(struct lexer_state *ls)
 {
 	size_t x = ls->sbuf, y = 0, z;	/* x: bytes left, y: bytes written */

 	if (ls->sbuf == 0) return;
 	do {
 		z = fwrite(ls->output_buf + y, 1, x, ls->output);
 		x -= z;
 		y += z;
 	} while (z && x > 0);
 	if (x > 0) {
 		/* covers both "nothing written" and "stopped half-way" */
 		error(ls->line, "could not flush output (disk full ?)");
 		die();
 	}
 	ls->sbuf = 0;
 }
 #endif

 /*
  * Emit the character c on the output of ls. With buffered output the
  * character goes into ls->output_buf and the buffer is flushed once it
  * holds OUTPUT_BUF_MEMG bytes; without it, the character is written
  * directly with putc() and a write failure is fatal.
  * Each '\n' emitted increments ls->oline.
  * This function should not be called, except by put_char().
  */
 static inline void write_char(struct lexer_state *ls, unsigned char c)
 {
 #ifndef NO_UCPP_BUF
 	ls->output_buf[ls->sbuf] = c;
 	ls->sbuf ++;
 	if (ls->sbuf == OUTPUT_BUF_MEMG) {
 		flush_output(ls);
 	}
 #else
 	if (putc((int)c, ls->output) == EOF) {
 		error(ls->line, "output write error (disk full ?)");
 		die();
 	}
 #endif
 	if (c != '\n') return;
 	ls->oline ++;
 }

 /*
  * Schedule the character c for output. The character is dropped
  * unless the KEEP_OUTPUT flag is set in ls->flags.
  */
 void put_char(struct lexer_state *ls, unsigned char c)
 {
 	if (!(ls->flags & KEEP_OUTPUT)) return;
 	write_char(ls, c);
 }

 /*
  * get next raw input character
  *
  * Returns the character as a non-negative value, or -1 when no more
  * input is available. When reading from a stream, a '\r' is returned as
  * '\n' and a '\n' that directly follows a '\r' is skipped. Trigraphs and
  * backslash-newline sequences are not handled here; see next_char().
  */
 static inline int read_char(struct lexer_state *ls)
 {
 	unsigned char c;

 	if (!ls->input) {
 		/*
 		 * no input stream: the characters come from
 		 * ls->input_string; this path returns at once, without
 		 * the line copy, '\r' handling and MISRA check below
 		 */
 		return ((ls->pbuf ++) < ls->ebuf) ?
 			ls->input_string[ls->pbuf - 1] : -1;
 	}
 	while (1) {
 #ifndef NO_UCPP_BUF
 		if (ls->pbuf == ls->ebuf) {
 			/* the buffer has been used up: refill it */
 #ifdef UCPP_MMAP
 			if (ls->from_mmap) {
 				/*
 				 * the buffer was a mapping: release it and
 				 * go back to the saved regular buffer
 				 */
 				munmap((void *)ls->input_buf, ls->ebuf);
 				ls->from_mmap = 0;
 				ls->input_buf = ls->input_buf_sav;
 			}
 #endif
 			ls->ebuf = fread(ls->input_buf, 1,
 				INPUT_BUF_MEMG, ls->input);
 			ls->pbuf = 0;
 		}
 		/* nothing more could be read */
 		if (ls->ebuf == 0) return -1;
 		c = ls->input_buf[ls->pbuf ++];
 #else
 		int x = getc(ls->input);

 		if (x == EOF) return -1;
 		c = x;
 #endif
 		if (ls->flags & COPY_LINE) {
 			/*
 			 * keep a copy of the current line in ls->copy_line;
 			 * characters beyond COPY_LINE_LENGTH - 1 are dropped
 			 */
 			if (c == '\n') {
 				ls->copy_line[ls->cli] = 0;
 				ls->cli = 0;
 			} else if (ls->cli < (COPY_LINE_LENGTH - 1)) {
 				ls->copy_line[ls->cli ++] = c;
 			}
 		}
 		if (ls->macfile && c == '\n') {
 			/* second half of a "\r\n" pair: skip it */
 			ls->macfile = 0;
 			continue;
 		}
 		ls->macfile = 0;
 		if (c == '\r') {
 			/*
 			 * We found a '\r'; we handle it as a newline
 			 * and ignore the next newline. This should work
 			 * with all combinations of Msdos, MacIntosh and
 			 * Unix files on these three platforms. On other
 			 * platforms, native file formats are always
 			 * supported.
 			 */
 			ls->macfile = 1;
 			c = '\n';
 		}
 		break;
 	}
 	/*vb*/
 #ifdef HAVE_MISRA
 	/*
 	 * when MISRA checking is on, report every character that is
 	 * neither whitespace, a letter, a digit nor one of the
 	 * punctuation characters listed below
 	 */
 	if(misracheck){
 	  if(!isspace((unsigned char)c)&&
 	     (c<'A'||c>'Z')&&
 	     (c<'a'||c>'z')&&
 	     (c<'0'||c>'9')&&
 	     c!='!'&&c!='\"'&&c!='#'&&c!='%'&&c!='&'&&c!='\''&&c!='('&&c!=')'&&
 	     c!='*'&&c!='+'&&c!=','&&c!='-'&&c!='.'&&c!='/'&&c!=':'&&
 	     c!=';'&&c!='<'&&c!='='&&c!='>'&&c!='?'&&c!='['&&c!=']'&&
 	     c!='^'&&c!='_'&&c!='{'&&c!='|'&&c!='}'&&c!='~'&&c!='\\'
 	     )
 	    misra_neu(5,4,1,ls->line,c,' ');
 	}
 #endif
 	return c;
 }

 /*
  * next_fifo_char(), char_lka1() and char_lka2() give a two character
  * look-ahead on the input stream; this is needed for trigraphs
  *
  * next_fifo_char() consumes one character: the oldest look-ahead
  * character if there is one, a fresh one from read_char() otherwise.
  */
 static inline int next_fifo_char(struct lexer_state *ls)
 {
 	int head;

 	if (ls->nlka == 0) return read_char(ls);
 	/* pop the front of the look-ahead queue and shift the other slot */
 	head = ls->lka[0];
 	ls->lka[0] = ls->lka[1];
 	ls->nlka --;
 	return head;
 }

 /*
  * Peek at the next character without consuming it; it is read into
  * the first look-ahead slot if that slot is still empty.
  */
 static inline int char_lka1(struct lexer_state *ls)
 {
 	if (ls->nlka != 0) return ls->lka[0];
 	ls->lka[0] = read_char(ls);
 	ls->nlka ++;
 	return ls->lka[0];
 }

 /*
  * Peek at the character after the next one. The first look-ahead slot
  * is expected to be filled already (this is checked under AUDIT only);
  * the second slot is read if the queue holds exactly one character.
  */
 static inline int char_lka2(struct lexer_state *ls)
 {
 #ifdef AUDIT
 	if (ls->nlka == 0) ouch("always in motion future is");
 #endif
 	if (ls->nlka != 1) return ls->lka[1];
 	ls->lka[1] = read_char(ls);
 	ls->nlka ++;
 	return ls->lka[1];
 }

 /*
  * Trigraph table: the sequence ??x, where x is the `old' member of an
  * entry, stands for the character in the `new' member of that entry.
  */
 struct trigraph {
 	int old;
 	int new;
 };

 static struct trigraph trig[9] = {
 	{ '=',  '#'  },
 	{ '/',  '\\' },
 	{ '\'', '^'  },
 	{ '(',  '['  },
 	{ ')',  ']'  },
 	{ '!',  '|'  },
 	{ '<',  '{'  },
 	{ '>',  '}'  },
 	{ '-',  '~'  }
 };

 /*
  * Returns the next character, after treatment of trigraphs and terminating
  * backslashes. Return value is -1 if there is no more input.
  *
  * The character returned stays the current one (kept in ls->last) until
  * ls->discard is set; as long as it is not, every call returns that same
  * character again.
  */
 static inline int next_char(struct lexer_state *ls)
 {
 	int c;

 	/* the current character has not been consumed yet */
 	if (!ls->discard) return ls->last;
 	ls->discard = 0;
 	do {
 		c = next_fifo_char(ls);
 #ifdef HAVE_ECPP
 		/* "::" is reduced to a single ':', a lone ':' becomes '@' */
 		if(c==':'){
 			if(char_lka1(ls)==':'){
 				next_fifo_char(ls);
 			}else{
 				c='@';
 			}
 		}
 #endif
 		/* check trigraphs */
 		/* note: the look-ahead is read before the flag is tested */
 		if (c == '?' && char_lka1(ls) == '?'
 			&& (ls->flags & HANDLE_TRIGRAPHS)) {
 			int i, d;

 			d = char_lka2(ls);
 			for (i = 0; i < 9; i ++) if (d == trig[i].old) {
 				if (ls->flags & WARN_TRIGRAPHS) {
 					ls->count_trigraphs ++;
 				}
 				if (ls->flags & WARN_TRIGRAPHS_MORE) {
 					warning(ls->line, "trigraph ?""?%c "
 						"encountered", d);
 					/*vb*/
 #ifdef HAVE_MISRA
 					misra_neu(7,4,2,-1,d);
 #endif
 				}
 				/* drop the two look-ahead characters */
 				next_fifo_char(ls);
 				next_fifo_char(ls);
 				c = trig[i].new;
 				break;
 			}
 		}
 		if (c == '\\' && char_lka1(ls) == '\n') {
 			/*
 			 * backslash-newline: count the line, drop the
 			 * newline and go on with the character after it
 			 */
 			ls->line ++;
 			next_fifo_char(ls);
 		} else {
 			ls->last = c;
 			return c;
 		}
 	} while (1);
 }

 /*
  * Externally visible entry point to next_char(): returns the current
  * character of ls, or -1 when there is no more input
  * (used by #error, #include directives)
  */
 int grap_char(struct lexer_state *ls)
 {
 	int c = next_char(ls);

 	return c;
 }

 /*
  * Mark the current character as consumed: the next call to
  * next_char() will fetch a new character from the input stream.
  * The line counter is advanced when the consumed character is a
  * newline, and the pending UTF-8 value is cleared.
  */
 void discard_char(struct lexer_state *ls)
 {
 #ifdef AUDIT
 	if (ls->discard) ouch("overcollecting garbage");
 #endif
 	if (ls->last == '\n') {
 		ls->line ++;
 	}
 	ls->utf8 = 0;
 	ls->discard = 1;
 }

 /*
  * Convert an UTF-8 encoded character to a Universal Character Name
  * using \u (or \U when appropriate).
  *
  * utf8 holds the bytes of one encoded character packed into a single
  * value, the first byte in the highest position and the last byte in
  * the low eight bits. A value below 128 is taken as the character
  * itself.
  *
  * buf receives the NUL-terminated result and must have room for 11
  * bytes. The return value is the length of the text written, without
  * the NUL:
  *   1  for a character below 128, copied as is
  *   6  for \uxxxx   (code points up to 0xffff)
  *   10 for \U00xxxxxx (anything above)
  * Hexadecimal digits are written in lower case.
  */
 static int utf8_to_string(unsigned char buf[], unsigned long utf8)
 {
 	unsigned long val = 0;
 	static char hex[16] = "0123456789abcdef";

 	if (utf8 & 0x80UL) {
 		unsigned long x1, x2, x3, x4;

 		x1 = (utf8 >> 24) & 0x7fUL;
 		x2 = (utf8 >> 16) & 0x7fUL;
 		x3 = (utf8 >> 8) & 0x7fUL;
 		x4 = (utf8) & 0x3fUL;
 		/*
 		 * x1 can only be the first byte of a 4-byte sequence
 		 * (3 payload bits). x2 and x3 are either a first byte
 		 * (bit 0x40 still set after the 0x7f mask) or a
 		 * continuation byte (6 payload bits, already clean).
 		 */
 		x1 &= 0x07UL;
 		if (x2 & 0x40UL) x2 &= 0x0fUL;
 		if (x3 & 0x40UL) x3 &= 0x1fUL;
 		/*
 		 * each continuation byte brings 6 bits, so the first
 		 * byte of a 4-byte sequence starts at bit 18
 		 */
 		val = x4 | (x3 << 6) | (x2 << 12) | (x1 << 18);
 	} else val = utf8;
 	if (val < 128) {
 		buf[0] = val;
 		buf[1] = 0;
 		return 1;
 	} else if (val <= 0xffffUL) {
 		/* fits in four digits, 0xffff included */
 		buf[0] = '\\';
 		buf[1] = 'u';
 		buf[2] = hex[(size_t)(val >> 12)];
 		buf[3] = hex[(size_t)((val >> 8) & 0xfU)];
 		buf[4] = hex[(size_t)((val >> 4) & 0xfU)];
 		buf[5] = hex[(size_t)(val & 0xfU)];
 		buf[6] = 0;
 		return 6;
 	}
 	/* val has at most 21 bits here, so val >> 20 is 0 or 1 */
 	buf[0] = '\\';
 	buf[1] = 'U';
 	buf[2] = '0';
 	buf[3] = '0';
 	buf[4] = hex[(size_t)(val >> 20)];
 	buf[5] = hex[(size_t)((val >> 16) & 0xfU)];
 	buf[6] = hex[(size_t)((val >> 12) & 0xfU)];
 	buf[7] = hex[(size_t)((val >> 8) & 0xfU)];
 	buf[8] = hex[(size_t)((val >> 4) & 0xfU)];
 	buf[9] = hex[(size_t)(val & 0xfU)];
 	buf[10] = 0;
 	return 10;
 }

 /*
  * Return the lower case form of the hexadecimal letters 'A' to 'F';
  * any other character is returned unchanged.
  */
 static char ucn_lower_hex(char x)
 {
 	switch (x) {
 	case 'A': return 'a';
 	case 'B': return 'b';
 	case 'C': return 'c';
 	case 'D': return 'd';
 	case 'E': return 'e';
 	case 'F': return 'f';
 	default: return x;
 	}
 }

 /*
  * Scan the identifier and put it in canonical form:
  *  -- transform \U0000xxxx into \uxxxx
  *  -- inside \u and \U, make letters low case
  *  -- report (some) incorrect use of UCN
  *
  * id is rewritten in place; the result is never longer than the input.
  * A backslash must be followed by 'u' and 4 more characters, or by 'U'
  * and 8 more characters; the characters themselves are not checked to
  * be hexadecimal digits. On the first backslash that does not fit, the
  * rest of the identifier is kept as it is and, if WARN_STANDARD is set
  * in ls->flags, a warning is issued.
  */
 static void canonize_id(struct lexer_state *ls, char *id)
 {
 	char *c, *d;	/* c: read position, d: write position (d <= c) */

 	for (c = d = id; *c;) {
 		int i;

 		if (*c != '\\') {
 			*(d ++) = *(c ++);
 			continue;
 		}
 		if (*(c + 1) == 'U') {
 			/* eight characters must follow */
 			for (i = 0; i < 8 && *(c + i + 2); i ++);
 			if (i != 8) goto canon_error;
 			*(d ++) = '\\';
 			c += 2;
 			/* four leading zeroes: use the short \u form */
 			for (i = 0; i < 4 && *(c + i) == '0'; i ++);
 			if (i == 4) {
 				*(d ++) = 'u';
 				c += 4;
 			} else {
 				*(d ++) = 'U';
 				i = 8;
 			}
 		} else if (*(c + 1) == 'u') {
 			/* four characters must follow */
 			for (i = 0; i < 4 && *(c + i + 2); i ++);
 			if (i != 4) goto canon_error;
 			*(d ++) = '\\';
 			*(d ++) = 'u';
 			c += 2;
 		} else goto canon_error;
 		/* i is now the number of digits left to copy */
 		for (; i > 0; i --) *(d ++) = ucn_lower_hex(*(c ++));
 	}
 	*d = 0;
 	return;

 canon_error:
 	for (; *c; *(d ++) = *(c ++));
 	/*
 	 * terminate before printing: when an earlier UCN was shortened,
 	 * d is behind c and stale characters would show in the message
 	 */
 	*d = 0;
 	if (ls->flags & WARN_STANDARD) {
 		warning(ls->line, "malformed identifier with UCN: '%s'", id);
 	}
 }

 /*
  * Run the automaton, in order to get the next token.
  * This function should not be called, except by next_token()
  *
  * The token is stored in ls->ctok. If ls->pending_token is set, that
  * token is delivered and no input is read. When the LEXER flag is not
  * set, the characters consumed are also echoed with put_char(), subject
  * to the conditions tested below.
  *
  * return value: 1 on error, 2 on end-of-file, 0 otherwise.
  */
 static inline int read_token(struct lexer_state *ls)
 {
 	int cstat = S_START, nstat;	/* current state, action just looked up */
 	size_t ltok = 0;		/* number of characters stored in the name */
 	int c, outc = 0, ucn_in_id = 0;	/* outc: character whose echo is deferred */
 	int shift_state;		/* UTF-8 continuation bytes still expected */
 	unsigned long utf8;		/* bytes of the UTF-8 character being read */
 	long l = ls->line;		/* line where the token starts */

 	ls->ctok->line = l;
 	if (ls->pending_token) {
 		/* a token was left over by the previous call */
 		if ((ls->ctok->type = ls->pending_token) == BUNCH) {
 			ls->ctok->name[0] = '\\';
 			ls->ctok->name[1] = 0;
 		}
 		ls->pending_token = 0;
 		return 0;
 	}
 	/* utf8 and shift_state are only used when UTF8_SOURCE is set */
 	if (ls->flags & UTF8_SOURCE) {
 		utf8 = ls->utf8;
 		shift_state = 0;
 	}
 	/* emit newlines until the output line count catches up */
 	if (!(ls->flags & LEXER) && (ls->flags & KEEP_OUTPUT))
 		for (; ls->line > ls->oline;) put_char(ls, '\n');
 	do {
 		c = next_char(ls);
 		if (c < 0) {
 			/* end of input */
 			if ((ls->flags & UTF8_SOURCE) && shift_state) {
 				if (ls->flags & WARN_STANDARD)
 					warning(ls->line, "truncated UTF-8 "
 						"character");
 				shift_state = 0;
 				utf8 = 0;
 			}
 			if (cstat == S_START) return 2;
 			nstat = cppm_vch[cstat];
 		} else {
 			if (ls->flags & UTF8_SOURCE) {
 				/*
 				 * the bytes of a multi-byte character are
 				 * gathered in utf8; the automaton is then
 				 * fed a '_' in place of the character
 				 */
 				if (shift_state) {
 					if ((c & 0xc0) != 0x80) {
 						/* not a continuation byte */
 						if (ls->flags & WARN_STANDARD)
 							warning(ls->line,
 								"truncated "
 								"UTF-8 "
 								"character");
 						shift_state = 0;
 						utf8 = 0;
 						c = '_';
 					} else {
 						utf8 = (utf8 << 8) | c;
 						if (-- shift_state) {
 							ls->discard = 1;
 							continue;
 						}
 						c = '_';
 					}
 				} else if ((c & 0xc0) == 0xc0) {
 					/* first byte: how many bytes follow */
 					if ((c & 0x30) == 0x30) {
 						shift_state = 3;
 					} else if (c & 0x20) {
 						shift_state = 2;
 					} else {
 						shift_state = 1;
 					}
 					utf8 = c;
 					ls->discard = 1;
 					continue;
 				} else utf8 = 0;
 			}
 			/* characters outside the table use column 0 */
 			nstat = cppm[cstat][c < MAX_CHAR_VAL ? c : 0];
 		}
 #ifdef AUDIT
 		if (nstat == S_OUCH) {
 			ouch("bad move...");
 		}
 #endif
 		/*
 		 * disable C++-like comments
 		 */
 		if (nstat == S_COMMENT5 && !(ls->flags & CPLUSPLUS_COMMENTS))
 			nstat = FRZ(STO(SLASH));

 		/*
 		 * pseudo-states (values above MSTATE without the STO flag)
 		 * are turned into an error or into a regular action
 		 */
 		if (noMOD(nstat) >= MSTATE && !ttSTO(nstat))
 			switch (noMOD(nstat)) {
 		case S_ILL:
 			if (ls->flags & CCHARSET) {
 				error(ls->line, "illegal character '%c'", c);
 				return 1;
 			}
 			nstat = PUT(STO(BUNCH));
 			break;
 		case S_BS:
 			ls->ctok->name[0] = '\\';
 			ltok ++;
 			nstat = FRZ(STO(BUNCH));
 			if (!(ls->flags & LEXER)) put_char(ls, '\\');
 			break;
 		case S_ROGUE_BS:
 			/* name now, the backslash as a token of its own next */
 			ls->pending_token = BUNCH;
 			nstat = FRZ(STO(NAME));
 			break;
 		case S_DDOT:
 			/* ".." is delivered as two DOT tokens */
 			ls->pending_token = DOT;
 			nstat = FRZ(STO(DOT));
 			break;
 		case S_DDSHARP:
 			/* "%:%" is delivered as DIG_SHARP followed by PCT */
 			ls->pending_token = PCT;
 			nstat = FRZ(STO(DIG_SHARP));
 			break;
 		case S_BEHEAD:
 			error(l, "unfinished string at end of line");
 			return 1;
 		case S_DECAY:
 			warning(l, "unterminated // comment");
 			nstat = FRZ(STO(COMMENT));
 			break;
 		case S_TRUNC:
 			error(l, "truncated token");
 			return 1;
 		case S_TRUNCC:
 			error(l, "truncated comment");
 			return 1;
 #ifdef AUDIT
 		case S_OUCH:
 			ouch("machine went out of control");
 			break;
 #endif
 		}
 		if (!ttFRZ(nstat)) {
 			/* the character is consumed */
 			discard_char(ls);
 			if (!(ls->flags & LEXER) && ls->condcomp) {
 				/*
 				 * echo the character, except inside names
 				 * and discarded comments; some characters
 				 * are held back in outc until the following
 				 * one is known (outc is -1 for a pending
 				 * "%:" and -2 for a pending "%:%")
 				 */
 				int z = ttSTO(nstat) ? S_ILL : noMOD(nstat);

 				if (cstat == S_NAME || z == S_NAME
 					|| ((CMT(cstat) || CMT(z))
 					&& (ls->flags & DISCARD_COMMENTS))) {
 					outc = 0;
 				} else if (z == S_LCHAR || z == S_SLASH
 					|| (z == S_SHARP && ls->ltwnl)
 					|| (z == S_PCT && ls->ltwnl)
 					|| (z == S_BACKSLASH)) {
 					outc = c;
 				} else if (z == S_PCT2 && ls->ltwnl) {
 					outc = -1;
 				} else if (z == S_PCT3 && ls->ltwnl) {
 					/* we have %:% but this still might
 					   not be a %:%: */
 					outc = -2;
 				} else {
 					if (outc < 0) {
 						put_char(ls, '%');
 						put_char(ls, ':');
 						if (outc == -2)
 							put_char(ls, '%');
 						outc = 0;
 					} else if (outc) {
 						put_char(ls, outc);
 						outc = 0;
 					}
 					put_char(ls, c);
 				}
 			}
 		} else if (outc == '/' && !(ls->flags & LEXER)
 			&& ls->condcomp) {
 			/* this is a hack: we need to dump a pending slash */
 			put_char(ls, outc);
 			outc = 0;
 		}
 		if (ttPUT(nstat)) {
 			/* the character becomes part of the token name */
 			if (cstat == S_NAME_BS) {
 				/* the backslash of a UCN was not stored yet */
 				ucn_in_id = 1;
 				wan(ls->ctok->name, ltok, '\\', ls->tknl);
 			}
 			if ((ls->flags & UTF8_SOURCE) && utf8) {
 				/* store the UTF-8 character as a UCN */
 				unsigned char buf[11];
 				int i, j;

 				for (i = 0, j = utf8_to_string(buf, utf8);
 					i < j; i ++)
 					wan(ls->ctok->name, ltok, buf[i],
 						ls->tknl);
 				/* if (j > 1) ucn_in_id = 1; */
 			} else wan(ls->ctok->name, ltok,
 				(unsigned char)c, ls->tknl);
 		}
 		if (ttSTO(nstat)) {
 			/* the token is complete */
 			if (S_TOKEN(noMOD(nstat))) {
 				wan(ls->ctok->name, ltok,
 					(unsigned char)0, ls->tknl);
 			}
 			ls->ctok->type = noMOD(nstat);
 			break;
 		}
 		cstat = noMOD(nstat);
 	} while (1);
 	/* a discarded comment leaves a single space in the output */
 	if (!(ls->flags & LEXER) && (ls->flags & DISCARD_COMMENTS)
 			&& ls->ctok->type == COMMENT) put_char(ls, ' ');
 	if (ucn_in_id && ls->ctok->type == NAME)
 		canonize_id(ls, ls->ctok->name);
 	return 0;
 }

 /*
  * fills ls->ctok with the next token
  *
  * When the READ_AGAIN flag is set, the flag is cleared and the token
  * already in ls->ctok is delivered again without reading any input;
  * the return value is then 0. In non-lexer mode the text of that token
  * is sent to the output at this point, except for names and for a
  * SHARP or DIG_SHARP token while ls->ltwnl is set; an OPT_NONE token is
  * changed into NONE and output as one space.
  * Otherwise the result of read_token() is returned.
  */
 int next_token(struct lexer_state *ls)
 {
 	char *text;

 	if (!(ls->flags & READ_AGAIN)) return read_token(ls);

 	ls->flags &= ~READ_AGAIN;
 	if (ls->flags & LEXER) return 0;

 	if (S_TOKEN(ls->ctok->type)) {
 		text = ls->ctok->name;
 	} else {
 		text = token_name(ls->ctok);
 	}
 	if (ls->ctok->type == OPT_NONE) {
 		ls->ctok->type = NONE;
 #ifdef SEMPER_FIDELIS
 		ls->ctok->name[0] = ' ';
 		ls->ctok->name[1] = 0;
 #endif
 		put_char(ls, ' ');
 		return 0;
 	}
 	if (ls->ctok->type == NAME) return 0;
 	if (ls->ltwnl && (ls->ctok->type == SHARP
 		|| ls->ctok->type == DIG_SHARP)) return 0;
 	while (*text) {
 		put_char(ls, *text);
 		text ++;
 	}
 	return 0;
 }