Blame - contiki/apps/htmlparser.c - contiki-1.x

blob: cb48841f30b287913f62e23776c950e5a8f51ae9 [file] [log] [blame]

adamdunkels	ca9ddcb	2003-03-19 14:13:31 +0000	[diff] [blame]	1	/*
				2	* Copyright (c) 2002, Adam Dunkels.
				3	* All rights reserved.
				4	*
				5	* Redistribution and use in source and binary forms, with or without
				6	* modification, are permitted provided that the following conditions
				7	* are met:
				8	* 1. Redistributions of source code must retain the above copyright
				9	* notice, this list of conditions and the following disclaimer.
				10	* 2. Redistributions in binary form must reproduce the above
				11	* copyright notice, this list of conditions and the following
				12	* disclaimer in the documentation and/or other materials provided
				13	* with the distribution.
				14	* 3. All advertising materials mentioning features or use of this
				15	* software must display the following acknowledgement:
				16	* This product includes software developed by Adam Dunkels.
				17	* 4. The name of the author may not be used to endorse or promote
				18	* products derived from this software without specific prior
				19	* written permission.
				20	*
				21	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
				22	* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
				23	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
				24	* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
				25	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
				26	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
				27	* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
				28	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
				29	* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
				30	* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
				31	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				32	*
				33	* This file is part of the Contiki desktop environment
				34	*
adamdunkels	fe6cd59	2003-08-09 13:29:53 +0000	[diff] [blame^]	35	* $Id: htmlparser.c,v 1.3 2003/08/09 13:29:53 adamdunkels Exp $
adamdunkels	ca9ddcb	2003-03-19 14:13:31 +0000	[diff] [blame]	36	*
				37	*/
				38
				39	/* htmlparser.c:
				40	*
				41	* Implements a very simplistic HTML parser. It recognizes HTML links
				42	* (<a href>-tags), HTML img alt tags, a few text flow break tags
				43	* (<br>, <p>, <h>), the <li> tag (but does not even try to
				44	* distinguish between <ol> or <ul>) as well as HTML comment tags
				45	* (<!-- -->).
				46	*
				47	* To save memory, the HTML parser is state machine driven, which
				48	* means that it will shave off one character from the HTML page,
				49	* process that character, and return to the next. Another way of
				50	* doing it would be to buffer a number of characters and process them
				51	* together.
				52	*
				53	* The main function in this file is the htmlparser_parse() function
				54	* which takes a htmlparser_state structure and a part of an HTML file
				55	* as an argument. The htmlparser_parse() function will call the
				56	* helper functions parse_char() and parse_tag(). Those functions will
				57	* in turn call the two callback functions htmlparser_char() and
				58	* htmlparser_tag(). Those functions must be implemented by the using
				59	* module (e.g., a web browser program).
				60	*
				61	* htmlparser_char() will be called for every non-tag character.
				62	*
				63	* htmlparser_tag() will be called whenever a full tag has been found.
				64	*
				65	*/
				66
				67
				68	#include "htmlparser.h"
				69	#include "html-strings.h"
				70	#include "www-conf.h"
adamdunkels	9d1aaef	2003-04-05 12:21:37 +0000	[diff] [blame]	71	#include "cc.h"
adamdunkels	ca9ddcb	2003-03-19 14:13:31 +0000	[diff] [blame]	72
				73	#if 1
				74	#define PRINTF(x)
				75	#else
				76	#include <stdio.h>
				77	#define PRINTF(x) printf x
				78	#endif
				79
adamdunkels	ca9ddcb	2003-03-19 14:13:31 +0000	[diff] [blame]	80
				81	/-----------------------------------------------------------------------------------/
				82	#define ISO_A 0x41
				83	#define ISO_B 0x42
				84	#define ISO_E 0x45
				85	#define ISO_F 0x46
				86	#define ISO_G 0x47
				87	#define ISO_H 0x48
				88	#define ISO_I 0x49
				89	#define ISO_L 0x4c
				90	#define ISO_M 0x4d
				91	#define ISO_P 0x50
				92	#define ISO_R 0x52
				93	#define ISO_T 0x54
				94
				95	#define ISO_a (ISO_A \| 0x20)
				96	#define ISO_b (ISO_B \| 0x20)
				97	#define ISO_e (ISO_E \| 0x20)
				98	#define ISO_f (ISO_F \| 0x20)
				99	#define ISO_g (ISO_G \| 0x20)
				100	#define ISO_h (ISO_H \| 0x20)
				101	#define ISO_i (ISO_I \| 0x20)
				102	#define ISO_l (ISO_L \| 0x20)
				103	#define ISO_m (ISO_M \| 0x20)
				104	#define ISO_p (ISO_P \| 0x20)
				105	#define ISO_r (ISO_R \| 0x20)
				106	#define ISO_t (ISO_T \| 0x20)
				107
				108	#define ISO_ht 0x09
				109	#define ISO_nl 0x0a
				110	#define ISO_cr 0x0d
				111	#define ISO_space 0x20
				112	#define ISO_bang 0x21
				113	#define ISO_citation 0x22
				114	#define ISO_ampersand 0x26
				115	#define ISO_citation2 0x27
				116	#define ISO_asterisk 0x2a
				117	#define ISO_dash 0x2d
				118	#define ISO_slash 0x2f
				119	#define ISO_semicolon 0x3b
				120	#define ISO_lt 0x3c
				121	#define ISO_eq 0x3d
				122	#define ISO_gt 0x3e
				123
				124	#define ISO_rbrack 0x5b
				125	#define ISO_lbrack 0x5d
				126
				127	#define MINORSTATE_NONE 0
				128	#define MINORSTATE_TEXT 1 /* Parse normal text */
				129	#define MINORSTATE_EXTCHAR 2 /* Check for semi-colon */
				130	#define MINORSTATE_TAG 3 /* Check for name of tag. */
				131	#define MINORSTATE_TAGEND 4 /* Scan for end of tag. */
				132	#define MINORSTATE_TAGATTR 5 /* Parse tag attr. */
				133	#define MINORSTATE_TAGATTRSPACE 6 /* Parse optional space after tag
				134	attr. */
				135	#define MINORSTATE_TAGATTRPARAM 7 /* Parse tag attr parameter. */
				136	#define MINORSTATE_TAGATTRPARAMNQ 8 /* Parse tag attr parameter without
				137	quotation marks. */
				138	#define MINORSTATE_HTMLCOMMENT 9 /* Scan for HTML comment end */
				139
				140	#define MAJORSTATE_NONE 0
				141	#define MAJORSTATE_BODY 1
				142	#define MAJORSTATE_LINK 2
				143	#define MAJORSTATE_FORM 3
				144	#define MAJORSTATE_DISCARD 4
				145
				146
				147	struct htmlparser_state {
				148	unsigned char minorstate;
				149	char tag[20];
				150	unsigned char tagptr;
				151	char tagattr[20];
				152	unsigned char tagattrptr;
				153	char tagattrparam[WWW_CONF_MAX_URLLEN];
				154	unsigned char tagattrparamptr;
				155	unsigned char lastchar, quotechar;
				156	unsigned char majorstate, lastmajorstate;
				157	char linkurl[WWW_CONF_MAX_URLLEN];
				158	char linktext[40];
				159	unsigned char linktextptr;
				160	#if WWW_CONF_FORMS
				161	char formaction[WWW_CONF_MAX_FORMACTIONLEN];
				162	char formname[WWW_CONF_MAX_FORMNAMELEN];
				163	unsigned char inputtype;
				164	char inputname[WWW_CONF_MAX_INPUTNAMELEN];
				165	char inputvalue[WWW_CONF_MAX_INPUTVALUELEN];
				166	unsigned char inputvaluesize;
				167	#endif /* WWW_CONF_FORMS */
				168	};
				169
				170	static struct htmlparser_state s;
				171
				172	/-----------------------------------------------------------------------------------/
				173	static char last[1] = {0xff};
				174
				175	static char *tags[] = {
				176	#define TAG_FIRST 0
				177	#define TAG_SLASHA 0
				178	html_slasha,
				179	#define TAG_SLASHCENTER 1
				180	html_slashcenter,
				181	#define TAG_SLASHFORM 2
				182	html_slashform,
				183	#define TAG_SLASHH 3
				184	html_slashh,
				185	#define TAG_SLASHSCRIPT 4
				186	html_slashscript,
				187	#define TAG_SLASHSELECT 5
				188	html_slashselect,
				189	#define TAG_SLASHSTYLE 6
				190	html_slashstyle,
				191	#define TAG_A 7
				192	html_a,
				193	#define TAG_BODY 8
				194	html_body,
				195	#define TAG_BR 9
				196	html_br,
				197	#define TAG_CENTER 10
				198	html_center,
				199	#define TAG_FORM 11
				200	html_form,
				201	#define TAG_FRAME 12
				202	html_frame,
				203	#define TAG_H1 13
				204	html_h1,
				205	#define TAG_H2 14
				206	html_h2,
				207	#define TAG_H3 15
				208	html_h3,
				209	#define TAG_H4 16
				210	html_h4,
				211	#define TAG_IMG 17
				212	html_img,
				213	#define TAG_INPUT 18
				214	html_input,
				215	#define TAG_LI 19
				216	html_li,
				217	#define TAG_P 20
				218	html_p,
				219	#define TAG_SCRIPT 21
				220	html_script,
				221	#define TAG_SELECT 22
				222	html_select,
				223	#define TAG_STYLE 23
				224	html_style,
				225	#define TAG_TR 24
				226	html_tr,
				227	#define TAG_LAST 25
				228	last,
				229	};
				230
				231	/-----------------------------------------------------------------------------------/
adamdunkels	9d1aaef	2003-04-05 12:21:37 +0000	[diff] [blame]	232	static unsigned char CC_FASTCALL
adamdunkels	ca9ddcb	2003-03-19 14:13:31 +0000	[diff] [blame]	233	iswhitespace(char c)
				234	{
				235	return (c == ISO_space \|\|
				236	c == ISO_nl \|\|
				237	c == ISO_cr \|\|
				238	c == ISO_ht);
				239	}
				240	/-----------------------------------------------------------------------------------/
adamdunkels	9d1aaef	2003-04-05 12:21:37 +0000	[diff] [blame]	241	static unsigned char CC_FASTCALL
adamdunkels	ca9ddcb	2003-03-19 14:13:31 +0000	[diff] [blame]	242	find_tag(char *tag)
				243	{
				244	static unsigned char first, last, i, tabi;
				245	static char tagc;
				246
				247	tabi = first = TAG_FIRST;
				248	last = TAG_LAST;
				249	i = 0;
				250
				251	do {
				252	tagc = tag[i];
				253
				254	if(tagc == 0 &&
				255	tags[first][i] == 0) {
				256	return first;
				257	}
				258
				259	/* First, find first matching tag from table. */
				260	while(tagc > (tags[tabi])[i] &&
				261	tabi < last) {
				262	++tabi;
				263	}
				264	first = tabi;
				265
				266	/* Second, find last matching tag from table. */
				267	while(tagc == (tags[tabi])[i] &&
				268	tabi < last) {
				269	++tabi;
				270	}
				271	last = tabi;
				272
				273	/* If first and last matching tags are equal, we have a match and
				274	return. Else we continue with the next character. */
				275	++i;
				276	tabi = first;
				277	} while(last != first);
				278	return TAG_LAST;
				279	}
				280	/-----------------------------------------------------------------------------------/
adamdunkels	9d1aaef	2003-04-05 12:21:37 +0000	[diff] [blame]	281	static void CC_FASTCALL
adamdunkels	ca9ddcb	2003-03-19 14:13:31 +0000	[diff] [blame]	282	parse_char(unsigned char c)
				283	{
				284	if(c < 0x80) {
				285	if(s.majorstate == MAJORSTATE_LINK) {
				286	if(s.linktextptr < sizeof(s.linktext)) {
				287	if(iswhitespace(c)) {
				288	c = ISO_space;
				289	}
				290	s.linktext[s.linktextptr] = c;
				291	++s.linktextptr;
				292	}
				293	} else if(s.majorstate != MAJORSTATE_DISCARD) {
				294	htmlparser_char(c);
				295	}
				296	}
				297	}
				298	/-----------------------------------------------------------------------------------/
adamdunkels	fe6cd59	2003-08-09 13:29:53 +0000	[diff] [blame^]	299	static void CC_FASTCALL
adamdunkels	ca9ddcb	2003-03-19 14:13:31 +0000	[diff] [blame]	300	switch_majorstate(unsigned char newstate)
				301	{
				302	if(s.majorstate != newstate) {
				303	PRINTF(("Switching state from %d to %d (%d)\n", s.majorstate, newstate, s.lastmajorstate));
				304	s.lastmajorstate = s.majorstate;
				305	s.majorstate = newstate;
				306	}
				307	}
				308	/-----------------------------------------------------------------------------------/
				309	static void
				310	parse_tag(void)
				311	{
				312	static char *tagattrparam;
				313	static unsigned char size, i;
				314
				315	PRINTF(("Parsing tag '%s' '%s' '%s'\n",
				316	s.tag, s.tagattr, s.tagattrparam));
				317
				318	switch(find_tag(s.tag)) {
				319	case TAG_P:
				320	case TAG_H1:
				321	case TAG_H2:
				322	case TAG_H3:
				323	case TAG_H4:
				324	parse_char(ISO_nl);
				325	/* FALLTHROUGH */
				326	case TAG_BR:
				327	case TAG_TR:
				328	case TAG_SLASHH:
				329	parse_char(ISO_nl);
				330	break;
				331	case TAG_LI:
				332	parse_char(ISO_nl);
				333	parse_char(ISO_asterisk);
				334	parse_char(ISO_space);
				335	break;
				336	case TAG_SCRIPT:
				337	case TAG_STYLE:
				338	case TAG_SELECT:
				339	switch_majorstate(MAJORSTATE_DISCARD);
				340	break;
				341	case TAG_SLASHSCRIPT:
				342	case TAG_SLASHSTYLE:
				343	case TAG_SLASHSELECT:
				344	switch_majorstate(s.lastmajorstate);
				345	break;
				346	case TAG_BODY:
				347	s.majorstate = s.lastmajorstate = MAJORSTATE_BODY;
				348	break;
				349	case TAG_FRAME:
				350	if(strncmp(s.tagattr, html_src, sizeof(html_src)) == 0 &&
				351	s.tagattrparam[0] != 0) {
				352	switch_majorstate(MAJORSTATE_BODY);
				353	parse_char(ISO_nl);
				354	parse_char(ISO_rbrack);
				355	parse_char(ISO_space);
				356	htmlparser_link(html_frame, s.tagattrparam);
				357	PRINTF(("Frame [%s]\n", s.tagattrparam));
				358	parse_char(ISO_space);
				359	parse_char(ISO_lbrack);
				360	parse_char(ISO_nl);
				361	}
				362	break;
				363	case TAG_IMG:
				364	if(strncmp(s.tagattr, html_alt, sizeof(html_alt)) == 0 &&
				365	s.tagattrparam[0] != 0) {
				366	parse_char(ISO_lt);
				367	tagattrparam = &s.tagattrparam[0];
				368	while(*tagattrparam) {
				369	parse_char(*tagattrparam);
				370	++tagattrparam;
				371	}
				372	parse_char(ISO_gt);
				373	}
				374	break;
				375	case TAG_A:
				376	PRINTF(("A %s %s\n", s.tagattr, s.tagattrparam));
				377	if(strncmp(s.tagattr, html_href, sizeof(html_href)) == 0 &&
				378	s.tagattrparam[0] != 0) {
				379	strcpy(s.linkurl, s.tagattrparam);
				380	switch_majorstate(MAJORSTATE_LINK);
				381	s.linktextptr = 0;
				382	}
				383	break;
				384	case TAG_SLASHA:
				385	if(s.majorstate == MAJORSTATE_LINK) {
				386	switch_majorstate(s.lastmajorstate);
				387	s.linktext[s.linktextptr] = 0;
				388	htmlparser_link(s.linktext, s.linkurl);
				389	PRINTF(("Link '%s' [%s]\n", s.linktext, s.linkurl));
				390	}
				391	break;
				392	#if WWW_CONF_FORMS
				393	case TAG_FORM:
				394	PRINTF(("Form tag\n"));
				395	switch_majorstate(MAJORSTATE_FORM);
				396	if(strncmp(s.tagattr, html_action, sizeof(html_action)) == 0) {
				397	PRINTF(("Form action '%s'\n", s.tagattrparam));
				398	strncpy(s.formaction, s.tagattrparam, WWW_CONF_MAX_FORMACTIONLEN - 1);
				399	} else if(strncmp(s.tagattr, html_name, sizeof(html_name)) == 0) {
				400	PRINTF(("Form name '%s'\n", s.tagattrparam));
				401	strncpy(s.formname, s.tagattrparam, WWW_CONF_MAX_FORMNAMELEN - 1);
				402	}
				403	s.inputname[0] = s.inputvalue[0] = 0;
				404	break;
				405	case TAG_SLASHFORM:
				406	switch_majorstate(MAJORSTATE_BODY);
				407	s.formaction[0] = s.formname[0] = 0;
				408	break;
				409	case TAG_INPUT:
				410	if(s.majorstate == MAJORSTATE_FORM) {
				411	/* First check if we are called at the end of an input tag. If
				412	so, we should render the input widget. */
				413	if(s.tagattr[0] == 0 &&
				414	s.inputname[0] != 0) {
				415	PRINTF(("Render input type %d\n", s.inputtype));
				416	switch(s.inputtype) {
				417	case HTMLPARSER_INPUTTYPE_NONE:
				418	case HTMLPARSER_INPUTTYPE_TEXT:
				419	for(i = 0; i < s.inputvaluesize; ++i) {
				420	if(s.inputvalue[i] == 0) {
				421	memset(&s.inputvalue[i], ISO_space, s.inputvaluesize - i);
				422	s.inputvalue[s.inputvaluesize] = 0;
				423	break;
				424	}
				425	}
				426	htmlparser_inputfield(s.inputvalue, s.inputname,
				427	s.formname, s.formaction);
				428	break;
				429	case HTMLPARSER_INPUTTYPE_SUBMIT:
adamdunkels	fe6cd59	2003-08-09 13:29:53 +0000	[diff] [blame^]	430	case HTMLPARSER_INPUTTYPE_IMAGE:
adamdunkels	ca9ddcb	2003-03-19 14:13:31 +0000	[diff] [blame]	431	htmlparser_submitbutton(s.inputvalue, s.inputname,
				432	s.formname, s.formaction);
				433	break;
				434	}
				435	s.inputtype = HTMLPARSER_INPUTTYPE_NONE;
				436	} else {
				437	PRINTF(("Input '%s' '%s'\n", s.tagattr, s.tagattrparam));
				438	if(strncmp(s.tagattr, html_type, sizeof(html_type)) == 0) {
				439	if(strncmp(s.tagattrparam, html_submit,
				440	sizeof(html_submit)) == 0) {
				441	s.inputtype = HTMLPARSER_INPUTTYPE_SUBMIT;
				442	} else if(strncmp(s.tagattrparam, html_image,
				443	sizeof(html_image)) == 0) {
				444	s.inputtype = HTMLPARSER_INPUTTYPE_IMAGE;
				445	} else if(strncmp(s.tagattrparam, html_text,
				446	sizeof(html_text)) == 0) {
				447	s.inputtype = HTMLPARSER_INPUTTYPE_TEXT;
				448	} else {
				449	s.inputtype = HTMLPARSER_INPUTTYPE_OTHER;
				450	}
				451	} else if(strncmp(s.tagattr, html_name,
				452	sizeof(html_name)) == 0) {
				453	strncpy(s.inputname, s.tagattrparam,
				454	WWW_CONF_MAX_INPUTNAMELEN);
				455	} else if(strncmp(s.tagattr, html_alt,
				456	sizeof(html_alt)) == 0 &&
				457	s.inputtype == HTMLPARSER_INPUTTYPE_IMAGE) {
				458	strncpy(s.inputvalue, s.tagattrparam,
				459	WWW_CONF_MAX_INPUTVALUELEN);
				460	} else if(strncmp(s.tagattr, html_value,
				461	sizeof(html_value)) == 0) {
				462	strncpy(s.inputvalue, s.tagattrparam,
				463	WWW_CONF_MAX_INPUTVALUELEN);
				464	} else if(strncmp(s.tagattr, html_size,
				465	sizeof(html_size)) == 0) {
				466	size = 0;
				467	if(s.tagattrparam[0] >= '0' &&
				468	s.tagattrparam[0] <= '9') {
				469	size = s.tagattrparam[0] - '0';
				470	if(s.tagattrparam[1] >= '0' &&
				471	s.tagattrparam[1] <= '9') {
				472	size = size * 10 + (s.tagattrparam[1] - '0');
				473	}
				474	}
				475	if(size >= WWW_CONF_MAX_INPUTVALUELEN) {
				476	size = WWW_CONF_MAX_INPUTVALUELEN - 1;
				477	}
				478	s.inputvaluesize = size;
				479	/* strncpy(s.inputvalue, s.tagattrparam,
				480	WWW_CONF_MAX_INPUTVALUELEN);*/
				481	}
				482	}
				483
				484	}
				485	break;
				486	#endif /* WWW_CONF_FORMS */
				487	#if WWW_CONF_RENDERSTATE
				488	case TAG_CENTER:
				489	parse_char(ISO_nl);
				490	htmlparser_renderstate(HTMLPARSER_RENDERSTATE_BEGIN \|
				491	HTMLPARSER_RENDERSTATE_CENTER);
				492	break;
				493	case TAG_SLASHCENTER:
				494	parse_char(ISO_nl);
				495	htmlparser_renderstate(HTMLPARSER_RENDERSTATE_END \|
				496	HTMLPARSER_RENDERSTATE_CENTER);
				497	break;
				498	#endif /* WWW_CONF_RENDERSTATE */
				499	}
				500	}
				501	/-----------------------------------------------------------------------------------/
				502	void
				503	htmlparser_init(void)
				504	{
				505	s.majorstate = s.lastmajorstate = MAJORSTATE_DISCARD;
				506	s.minorstate = MINORSTATE_TEXT;
				507	s.lastchar = 0;
				508	}
				509	/-----------------------------------------------------------------------------------/
adamdunkels	9d1aaef	2003-04-05 12:21:37 +0000	[diff] [blame]	510	static char CC_FASTCALL
adamdunkels	ca9ddcb	2003-03-19 14:13:31 +0000	[diff] [blame]	511	lowercase(char c)
				512	{
				513	/* XXX: This is a brute force approach to lower-case
				514	converting and should not be used anywhere else! It
				515	works for our purposes, however (i.e., HTML tags). */
				516	if(c > 0x40) {
				517	return (c & 0x1f) \| 0x60;
				518	} else {
				519	return c;
				520	}
				521	}
				522	/-----------------------------------------------------------------------------------/
				523	static void
				524	endtagfound(void)
				525	{
				526	s.tag[s.tagptr] = 0;
				527	s.tagattr[s.tagattrptr] = 0;
				528	s.tagattrparam[s.tagattrparamptr] = 0;
				529	}
				530	/-----------------------------------------------------------------------------------/
				531	/* htmlparser_parse():
				532	*
				533	* This is the main function in the HTML parser module and it parses
				534	* the HTML data in the input buffer. The htmlparser_state is updated
				535	* as the buffer is parsed character by character. The functions
				536	* parse_char() and parse_tag() (defined earlier in this file) are
				537	* called to process regular characters and HTML tags,
				538	* respectively.
				539	*
				540	* Note that the input buffer does not have to contain full HTML tags;
				541	* the parser is state machine driven in order to be able to work with
				542	* buffers that have been divided in any way.
				543	*/
				544	void
				545	htmlparser_parse(char *data, u16_t len)
				546	{
				547	static char c;
				548
				549	while(len > 0) {
				550	c = *data;
				551	--len;
				552	++data;
				553
				554	switch(s.minorstate) {
				555	case MINORSTATE_NONE:
				556	break;
				557	case MINORSTATE_TEXT:
				558	/* We are currently parsing some text, so we look for signs of
				559	an HTML tag starting (i.e., a '<' character). We also
				560	compress any whitespace character to one single space
				561	character (' '). */
				562	if(c == ISO_lt) {
				563	s.minorstate = MINORSTATE_TAG;
				564	s.tagptr = 0;
				565	endtagfound();
				566	} else if(c == ISO_ampersand) {
				567	s.minorstate = MINORSTATE_EXTCHAR;
				568	} else {
				569	if(iswhitespace(c)) {
				570	if(s.lastchar != ISO_space) {
				571	parse_char(' ');
				572	s.lastchar = ISO_space;
				573	c = ISO_space;
				574	}
				575	} else {
				576	parse_char(c);
				577	}
				578	}
				579	break;
				580	case MINORSTATE_EXTCHAR:
				581	if(c == ISO_semicolon) {
				582	s.minorstate = MINORSTATE_TEXT;
				583	parse_char(' ');
				584	} else if(iswhitespace(c)) {
				585	s.minorstate = MINORSTATE_TEXT;
				586	parse_char('&');
				587	parse_char(' ');
				588	}
				589	break;
				590	case MINORSTATE_TAG:
				591	/* We are currently parsing within the name of a tag. We check
				592	for the end of a tag (the '>' character) or whitespace (which
				593	indicates that we should parse a tag attr argument
				594	instead). */
				595	if(c == ISO_gt) {
				596	/* Full tag found. We continue parsing regular text. */
				597	s.minorstate = MINORSTATE_TEXT;
				598	s.tagattrptr = s.tagattrparamptr = 0;
				599	endtagfound();
				600	parse_tag();
				601	} else if(iswhitespace(c)) {
				602	/* The name of the tag found. We continue parsing the tag
				603	attr.*/
				604	s.minorstate = MINORSTATE_TAGATTR;
				605	s.tagattrptr = 0;
				606	endtagfound();
				607	} else {
				608	/* Keep track of the name of the tag, but convert it to
				609	lower case. */
				610
				611	s.tag[s.tagptr] = lowercase(c);
				612	++s.tagptr;
				613	/* Check if the ->tag field is full. If so, we just eat up
				614	any data left in the tag. */
				615	if(s.tagptr == sizeof(s.tag)) {
				616	s.minorstate = MINORSTATE_TAGEND;
				617	}
				618	}
				619
				620	/* Check for HTML comment, indicated by <!-- */
				621	if(s.tagptr == 3 &&
				622	s.tag[0] == ISO_bang &&
				623	s.tag[1] == ISO_dash &&
				624	s.tag[2] == ISO_dash) {
				625	PRINTF(("Starting comment...\n"));
				626	s.minorstate = MINORSTATE_HTMLCOMMENT;
				627	s.tagptr = 0;
				628	endtagfound();
				629	}
				630	break;
				631	case MINORSTATE_TAGATTR:
				632	/* We parse the "tag attr", i.e., the "href" in <a
				633	href="...">. */
				634	if(c == ISO_gt) {
				635	/* Full tag found. */
				636	s.minorstate = MINORSTATE_TEXT;
				637	s.tagattrparamptr = 0;
				638	s.tagattrptr = 0;
				639	endtagfound();
				640	parse_tag();
				641	s.tagptr = 0;
				642	endtagfound();
				643
				644	} else if(iswhitespace(c)) {
				645	if(s.tagattrptr == 0) {
				646	/* Discard leading spaces. */
				647	} else {
				648	/* A non-leading space is the end of the attribute. */
				649	s.tagattrparamptr = 0;
				650	endtagfound();
				651	parse_tag();
				652	s.minorstate = MINORSTATE_TAGATTRSPACE;
				653	/* s.tagattrptr = 0;
				654	endtagfound();*/
				655	}
				656	} else if(c == ISO_eq) {
				657	s.minorstate = MINORSTATE_TAGATTRPARAMNQ;
				658	s.tagattrparamptr = 0;
				659	endtagfound();
				660	} else {
				661	s.tagattr[s.tagattrptr] = lowercase(c);
				662	++s.tagattrptr;
				663	/* Check if the "tagattr" field is full. If so, we just eat
				664	up any data left in the tag. */
				665	if(s.tagattrptr == sizeof(s.tagattr)) {
				666	s.minorstate = MINORSTATE_TAGEND;
				667	}
				668	}
				669	break;
				670	case MINORSTATE_TAGATTRSPACE:
				671	if(iswhitespace(c)) {
				672	/* Discard spaces. */
				673	} else if(c == ISO_eq) {
				674	s.minorstate = MINORSTATE_TAGATTRPARAMNQ;
				675	s.tagattrparamptr = 0;
				676	endtagfound();
				677	parse_tag();
				678	} else {
				679	s.tagattr[0] = lowercase(c);
				680	s.tagattrptr = 1;
				681	s.minorstate = MINORSTATE_TAGATTR;
				682	}
				683	break;
				684	case MINORSTATE_TAGATTRPARAMNQ:
				685	/* We are parsing the "tag attr parameter", i.e., the link part
				686	in <a href="link">. */
				687	if(c == ISO_gt) {
				688	/* Full tag found. */
				689	endtagfound();
				690	parse_tag();
				691	s.minorstate = MINORSTATE_TEXT;
				692	s.tagattrptr = 0;
				693	endtagfound();
				694	parse_tag();
				695	s.tagptr = 0;
				696	endtagfound();
				697	} else if(iswhitespace(c) &&
				698	s.tagattrparamptr == 0) {
				699	/* Discard leading spaces. */
				700	} else if((c == ISO_citation \|\|
				701	c == ISO_citation2) &&
				702	s.tagattrparamptr == 0) {
				703	s.minorstate = MINORSTATE_TAGATTRPARAM;
				704	s.quotechar = c;
				705	PRINTF(("tag attr param q found\n"));
				706	} else if(iswhitespace(c)) {
				707	PRINTF(("Non-leading space found at %d\n",
				708	s.tagattrparamptr));
				709	/* Stop parsing if a non-leading space was found */
				710	endtagfound();
				711	parse_tag();
				712
				713	s.minorstate = MINORSTATE_TAGATTR;
				714	s.tagattrptr = 0;
				715	endtagfound();
				716	} else {
				717	s.tagattrparam[s.tagattrparamptr] = c;
				718	++s.tagattrparamptr;
				719	/* Check if the "tagattr" field is full. If so, we just eat
				720	up any data left in the tag. */
				721	if(s.tagattrparamptr >= sizeof(s.tagattrparam) - 1) {
				722	s.minorstate = MINORSTATE_TAGEND;
				723	}
				724	}
				725
				726	break;
				727	case MINORSTATE_TAGATTRPARAM:
				728	/* We are parsing the "tag attr parameter", i.e., the link
				729	part in <a href="link">. */
				730	if(c == s.quotechar) {
				731	/* Found end of tag attr parameter. */
				732	endtagfound();
				733	parse_tag();
				734
				735	s.minorstate = MINORSTATE_TAGATTR;
				736	s.tagattrptr = 0;
				737	endtagfound();
				738	} else {
				739	if(iswhitespace(c)) {
				740	c = ISO_space;
				741	}
				742	s.tagattrparam[s.tagattrparamptr] = c;
				743	++s.tagattrparamptr;
				744	/* Check if the "tagattr" field is full. If so, we just eat
				745	up any data left in the tag. */
				746	if(s.tagattrparamptr >= sizeof(s.tagattrparam) - 1) {
				747	s.minorstate = MINORSTATE_TAGEND;
				748	}
				749	}
				750
				751	break;
				752	case MINORSTATE_HTMLCOMMENT:
				753	if(c == ISO_dash) {
				754	++s.tagptr;
				755	} else if(c == ISO_gt && s.tagptr > 0) {
				756	PRINTF(("Comment done.\n"));
				757	s.minorstate = MINORSTATE_TEXT;
				758	} else {
				759	s.tagptr = 0;
				760	}
				761	break;
				762	case MINORSTATE_TAGEND:
				763	/* Discard characters until a '>' is seen. */
				764	if(c == ISO_gt) {
				765	s.minorstate = MINORSTATE_TEXT;
				766	s.tagattrptr = 0;
				767	endtagfound();
				768	parse_tag();
				769	}
				770	break;
				771	}
				772
				773	s.lastchar = c;
				774	}
				775	}
				776	/-----------------------------------------------------------------------------------/