blob: cb48841f30b287913f62e23776c950e5a8f51ae9 [file] [log] [blame]
adamdunkelsca9ddcb2003-03-19 14:13:31 +00001/*
2 * Copyright (c) 2002, Adam Dunkels.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following
12 * disclaimer in the documentation and/or other materials provided
13 * with the distribution.
14 * 3. All advertising materials mentioning features or use of this
15 * software must display the following acknowledgement:
16 * This product includes software developed by Adam Dunkels.
17 * 4. The name of the author may not be used to endorse or promote
18 * products derived from this software without specific prior
19 * written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
22 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
25 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
27 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
29 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 *
33 * This file is part of the Contiki desktop environment
34 *
adamdunkelsfe6cd592003-08-09 13:29:53 +000035 * $Id: htmlparser.c,v 1.3 2003/08/09 13:29:53 adamdunkels Exp $
adamdunkelsca9ddcb2003-03-19 14:13:31 +000036 *
37 */
38
/* htmlparser.c:
 *
 * Implements a very simplistic HTML parser. It recognizes HTML links
 * (<a href>-tags), HTML img alt tags, a few text flow break tags
 * (<br>, <p>, <h>), the <li> tag (but does not even try to
 * distinguish between <ol> or <ul>) as well as HTML comment tags
 * (<!-- -->).
 *
 * To save memory, the HTML parser is state machine driven, which
 * means that it will shave off one character from the HTML page,
 * process that character, and then move on to the next. Another way
 * of doing it would be to buffer a number of characters and process
 * them together.
 *
 * The main function in this file is the htmlparser_parse() function
 * which takes a htmlparser_state structure and a part of an HTML file
 * as an argument. The htmlparser_parse() function will call the
 * helper functions parse_char() and parse_tag(). Those functions will
 * in turn call the two callback functions htmlparser_char() and
 * htmlparser_tag(). Those functions must be implemented by the using
 * module (e.g., a web browser program).
 *
 * htmlparser_char() will be called for every non-tag character.
 *
 * htmlparser_tag() will be called whenever a full tag has been found.
 *
 */
66
67
68#include "htmlparser.h"
69#include "html-strings.h"
70#include "www-conf.h"
adamdunkels9d1aaef2003-04-05 12:21:37 +000071#include "cc.h"
adamdunkelsca9ddcb2003-03-19 14:13:31 +000072
/*
 * Debug tracing. PRINTF takes one parenthesised printf() argument list,
 * e.g. PRINTF(("value %d\n", v)). While the "#if 1" branch is selected
 * the macro expands to nothing, so its arguments are never evaluated;
 * change the condition to 0 to forward the arguments to printf().
 */
#if 1
#define PRINTF(x)
#else
#include <stdio.h>
#define PRINTF(x) printf x
#endif
79
adamdunkelsca9ddcb2003-03-19 14:13:31 +000080
81/*-----------------------------------------------------------------------------------*/
/*
 * Character codes used by the parser, written as numeric constants
 * rather than character literals (presumably so the parser also works
 * when the compiler's native character set is not ASCII - confirm).
 */

/* Upper-case letters that occur in the recognised tag names. */
#define ISO_A     0x41
#define ISO_B     0x42
#define ISO_E     0x45
#define ISO_F     0x46
#define ISO_G     0x47
#define ISO_H     0x48
#define ISO_I     0x49
#define ISO_L     0x4c
#define ISO_M     0x4d
#define ISO_P     0x50
#define ISO_R     0x52
#define ISO_T     0x54

/* Lower-case counterparts: the upper-case code with bit 0x20 set. */
#define ISO_a     (ISO_A | 0x20)
#define ISO_b     (ISO_B | 0x20)
#define ISO_e     (ISO_E | 0x20)
#define ISO_f     (ISO_F | 0x20)
#define ISO_g     (ISO_G | 0x20)
#define ISO_h     (ISO_H | 0x20)
#define ISO_i     (ISO_I | 0x20)
#define ISO_l     (ISO_L | 0x20)
#define ISO_m     (ISO_M | 0x20)
#define ISO_p     (ISO_P | 0x20)
#define ISO_r     (ISO_R | 0x20)
#define ISO_t     (ISO_T | 0x20)

/* Whitespace and punctuation. */
#define ISO_ht    0x09          /* horizontal tab */
#define ISO_nl    0x0a          /* line feed */
#define ISO_cr    0x0d          /* carriage return */
#define ISO_space 0x20
#define ISO_bang  0x21          /* ! */
#define ISO_citation 0x22       /* double quote */
#define ISO_ampersand 0x26      /* & */
#define ISO_citation2 0x27      /* single quote */
#define ISO_asterisk 0x2a       /* * */
#define ISO_dash  0x2d          /* - */
#define ISO_slash 0x2f          /* / */
#define ISO_semicolon 0x3b      /* ; */
#define ISO_lt    0x3c          /* < */
#define ISO_eq    0x3d          /* = */
#define ISO_gt    0x3e          /* > */

/*
 * NOTE: the two bracket names are swapped relative to their values:
 * 0x5b is '[' (the opening bracket) and 0x5d is ']' (the closing one).
 * The names and values are left untouched because existing code emits
 * ISO_rbrack first and ISO_lbrack last, which yields "[ ... ]".
 */
#define ISO_rbrack 0x5b         /* [ */
#define ISO_lbrack 0x5d         /* ] */
126
/*
 * Minor states: where inside the character stream the tokenizer in
 * htmlparser_parse() currently is.
 */
#define MINORSTATE_NONE           0
#define MINORSTATE_TEXT           1 /* Parse normal text */
#define MINORSTATE_EXTCHAR        2 /* Check for semi-colon */
#define MINORSTATE_TAG            3 /* Check for name of tag. */
#define MINORSTATE_TAGEND         4 /* Scan for end of tag. */
#define MINORSTATE_TAGATTR        5 /* Parse tag attr. */
#define MINORSTATE_TAGATTRSPACE   6 /* Parse optional space after tag
                                       attr. */
#define MINORSTATE_TAGATTRPARAM   7 /* Parse tag attr parameter. */
#define MINORSTATE_TAGATTRPARAMNQ 8 /* Parse tag attr parameter without
                                       quotation marks. */
#define MINORSTATE_HTMLCOMMENT    9 /* Scan for HTML comment end */

/*
 * Major states: what is done with the text characters handed to
 * parse_char() (collected as link text, dropped, or passed on).
 */
#define MAJORSTATE_NONE    0
#define MAJORSTATE_BODY    1
#define MAJORSTATE_LINK    2 /* Text is collected as link text. */
#define MAJORSTATE_FORM    3
#define MAJORSTATE_DISCARD 4 /* Text is dropped. */
145
146
147struct htmlparser_state {
148 unsigned char minorstate;
149 char tag[20];
150 unsigned char tagptr;
151 char tagattr[20];
152 unsigned char tagattrptr;
153 char tagattrparam[WWW_CONF_MAX_URLLEN];
154 unsigned char tagattrparamptr;
155 unsigned char lastchar, quotechar;
156 unsigned char majorstate, lastmajorstate;
157 char linkurl[WWW_CONF_MAX_URLLEN];
158 char linktext[40];
159 unsigned char linktextptr;
160#if WWW_CONF_FORMS
161 char formaction[WWW_CONF_MAX_FORMACTIONLEN];
162 char formname[WWW_CONF_MAX_FORMNAMELEN];
163 unsigned char inputtype;
164 char inputname[WWW_CONF_MAX_INPUTNAMELEN];
165 char inputvalue[WWW_CONF_MAX_INPUTVALUELEN];
166 unsigned char inputvaluesize;
167#endif /* WWW_CONF_FORMS */
168};
169
170static struct htmlparser_state s;
171
172/*-----------------------------------------------------------------------------------*/
173static char last[1] = {0xff};
174
175static char *tags[] = {
176#define TAG_FIRST 0
177#define TAG_SLASHA 0
178 html_slasha,
179#define TAG_SLASHCENTER 1
180 html_slashcenter,
181#define TAG_SLASHFORM 2
182 html_slashform,
183#define TAG_SLASHH 3
184 html_slashh,
185#define TAG_SLASHSCRIPT 4
186 html_slashscript,
187#define TAG_SLASHSELECT 5
188 html_slashselect,
189#define TAG_SLASHSTYLE 6
190 html_slashstyle,
191#define TAG_A 7
192 html_a,
193#define TAG_BODY 8
194 html_body,
195#define TAG_BR 9
196 html_br,
197#define TAG_CENTER 10
198 html_center,
199#define TAG_FORM 11
200 html_form,
201#define TAG_FRAME 12
202 html_frame,
203#define TAG_H1 13
204 html_h1,
205#define TAG_H2 14
206 html_h2,
207#define TAG_H3 15
208 html_h3,
209#define TAG_H4 16
210 html_h4,
211#define TAG_IMG 17
212 html_img,
213#define TAG_INPUT 18
214 html_input,
215#define TAG_LI 19
216 html_li,
217#define TAG_P 20
218 html_p,
219#define TAG_SCRIPT 21
220 html_script,
221#define TAG_SELECT 22
222 html_select,
223#define TAG_STYLE 23
224 html_style,
225#define TAG_TR 24
226 html_tr,
227#define TAG_LAST 25
228 last,
229};
230
231/*-----------------------------------------------------------------------------------*/
adamdunkels9d1aaef2003-04-05 12:21:37 +0000232static unsigned char CC_FASTCALL
adamdunkelsca9ddcb2003-03-19 14:13:31 +0000233iswhitespace(char c)
234{
235 return (c == ISO_space ||
236 c == ISO_nl ||
237 c == ISO_cr ||
238 c == ISO_ht);
239}
240/*-----------------------------------------------------------------------------------*/
adamdunkels9d1aaef2003-04-05 12:21:37 +0000241static unsigned char CC_FASTCALL
adamdunkelsca9ddcb2003-03-19 14:13:31 +0000242find_tag(char *tag)
243{
244 static unsigned char first, last, i, tabi;
245 static char tagc;
246
247 tabi = first = TAG_FIRST;
248 last = TAG_LAST;
249 i = 0;
250
251 do {
252 tagc = tag[i];
253
254 if(tagc == 0 &&
255 tags[first][i] == 0) {
256 return first;
257 }
258
259 /* First, find first matching tag from table. */
260 while(tagc > (tags[tabi])[i] &&
261 tabi < last) {
262 ++tabi;
263 }
264 first = tabi;
265
266 /* Second, find last matching tag from table. */
267 while(tagc == (tags[tabi])[i] &&
268 tabi < last) {
269 ++tabi;
270 }
271 last = tabi;
272
273 /* If first and last matching tags are equal, we have a match and
274 return. Else we continue with the next character. */
275 ++i;
276 tabi = first;
277 } while(last != first);
278 return TAG_LAST;
279}
280/*-----------------------------------------------------------------------------------*/
adamdunkels9d1aaef2003-04-05 12:21:37 +0000281static void CC_FASTCALL
adamdunkelsca9ddcb2003-03-19 14:13:31 +0000282parse_char(unsigned char c)
283{
284 if(c < 0x80) {
285 if(s.majorstate == MAJORSTATE_LINK) {
286 if(s.linktextptr < sizeof(s.linktext)) {
287 if(iswhitespace(c)) {
288 c = ISO_space;
289 }
290 s.linktext[s.linktextptr] = c;
291 ++s.linktextptr;
292 }
293 } else if(s.majorstate != MAJORSTATE_DISCARD) {
294 htmlparser_char(c);
295 }
296 }
297}
298/*-----------------------------------------------------------------------------------*/
adamdunkelsfe6cd592003-08-09 13:29:53 +0000299static void CC_FASTCALL
adamdunkelsca9ddcb2003-03-19 14:13:31 +0000300switch_majorstate(unsigned char newstate)
301{
302 if(s.majorstate != newstate) {
303 PRINTF(("Switching state from %d to %d (%d)\n", s.majorstate, newstate, s.lastmajorstate));
304 s.lastmajorstate = s.majorstate;
305 s.majorstate = newstate;
306 }
307}
308/*-----------------------------------------------------------------------------------*/
309static void
310parse_tag(void)
311{
312 static char *tagattrparam;
313 static unsigned char size, i;
314
315 PRINTF(("Parsing tag '%s' '%s' '%s'\n",
316 s.tag, s.tagattr, s.tagattrparam));
317
318 switch(find_tag(s.tag)) {
319 case TAG_P:
320 case TAG_H1:
321 case TAG_H2:
322 case TAG_H3:
323 case TAG_H4:
324 parse_char(ISO_nl);
325 /* FALLTHROUGH */
326 case TAG_BR:
327 case TAG_TR:
328 case TAG_SLASHH:
329 parse_char(ISO_nl);
330 break;
331 case TAG_LI:
332 parse_char(ISO_nl);
333 parse_char(ISO_asterisk);
334 parse_char(ISO_space);
335 break;
336 case TAG_SCRIPT:
337 case TAG_STYLE:
338 case TAG_SELECT:
339 switch_majorstate(MAJORSTATE_DISCARD);
340 break;
341 case TAG_SLASHSCRIPT:
342 case TAG_SLASHSTYLE:
343 case TAG_SLASHSELECT:
344 switch_majorstate(s.lastmajorstate);
345 break;
346 case TAG_BODY:
347 s.majorstate = s.lastmajorstate = MAJORSTATE_BODY;
348 break;
349 case TAG_FRAME:
350 if(strncmp(s.tagattr, html_src, sizeof(html_src)) == 0 &&
351 s.tagattrparam[0] != 0) {
352 switch_majorstate(MAJORSTATE_BODY);
353 parse_char(ISO_nl);
354 parse_char(ISO_rbrack);
355 parse_char(ISO_space);
356 htmlparser_link(html_frame, s.tagattrparam);
357 PRINTF(("Frame [%s]\n", s.tagattrparam));
358 parse_char(ISO_space);
359 parse_char(ISO_lbrack);
360 parse_char(ISO_nl);
361 }
362 break;
363 case TAG_IMG:
364 if(strncmp(s.tagattr, html_alt, sizeof(html_alt)) == 0 &&
365 s.tagattrparam[0] != 0) {
366 parse_char(ISO_lt);
367 tagattrparam = &s.tagattrparam[0];
368 while(*tagattrparam) {
369 parse_char(*tagattrparam);
370 ++tagattrparam;
371 }
372 parse_char(ISO_gt);
373 }
374 break;
375 case TAG_A:
376 PRINTF(("A %s %s\n", s.tagattr, s.tagattrparam));
377 if(strncmp(s.tagattr, html_href, sizeof(html_href)) == 0 &&
378 s.tagattrparam[0] != 0) {
379 strcpy(s.linkurl, s.tagattrparam);
380 switch_majorstate(MAJORSTATE_LINK);
381 s.linktextptr = 0;
382 }
383 break;
384 case TAG_SLASHA:
385 if(s.majorstate == MAJORSTATE_LINK) {
386 switch_majorstate(s.lastmajorstate);
387 s.linktext[s.linktextptr] = 0;
388 htmlparser_link(s.linktext, s.linkurl);
389 PRINTF(("Link '%s' [%s]\n", s.linktext, s.linkurl));
390 }
391 break;
392#if WWW_CONF_FORMS
393 case TAG_FORM:
394 PRINTF(("Form tag\n"));
395 switch_majorstate(MAJORSTATE_FORM);
396 if(strncmp(s.tagattr, html_action, sizeof(html_action)) == 0) {
397 PRINTF(("Form action '%s'\n", s.tagattrparam));
398 strncpy(s.formaction, s.tagattrparam, WWW_CONF_MAX_FORMACTIONLEN - 1);
399 } else if(strncmp(s.tagattr, html_name, sizeof(html_name)) == 0) {
400 PRINTF(("Form name '%s'\n", s.tagattrparam));
401 strncpy(s.formname, s.tagattrparam, WWW_CONF_MAX_FORMNAMELEN - 1);
402 }
403 s.inputname[0] = s.inputvalue[0] = 0;
404 break;
405 case TAG_SLASHFORM:
406 switch_majorstate(MAJORSTATE_BODY);
407 s.formaction[0] = s.formname[0] = 0;
408 break;
409 case TAG_INPUT:
410 if(s.majorstate == MAJORSTATE_FORM) {
411 /* First check if we are called at the end of an input tag. If
412 so, we should render the input widget. */
413 if(s.tagattr[0] == 0 &&
414 s.inputname[0] != 0) {
415 PRINTF(("Render input type %d\n", s.inputtype));
416 switch(s.inputtype) {
417 case HTMLPARSER_INPUTTYPE_NONE:
418 case HTMLPARSER_INPUTTYPE_TEXT:
419 for(i = 0; i < s.inputvaluesize; ++i) {
420 if(s.inputvalue[i] == 0) {
421 memset(&s.inputvalue[i], ISO_space, s.inputvaluesize - i);
422 s.inputvalue[s.inputvaluesize] = 0;
423 break;
424 }
425 }
426 htmlparser_inputfield(s.inputvalue, s.inputname,
427 s.formname, s.formaction);
428 break;
429 case HTMLPARSER_INPUTTYPE_SUBMIT:
adamdunkelsfe6cd592003-08-09 13:29:53 +0000430 case HTMLPARSER_INPUTTYPE_IMAGE:
adamdunkelsca9ddcb2003-03-19 14:13:31 +0000431 htmlparser_submitbutton(s.inputvalue, s.inputname,
432 s.formname, s.formaction);
433 break;
434 }
435 s.inputtype = HTMLPARSER_INPUTTYPE_NONE;
436 } else {
437 PRINTF(("Input '%s' '%s'\n", s.tagattr, s.tagattrparam));
438 if(strncmp(s.tagattr, html_type, sizeof(html_type)) == 0) {
439 if(strncmp(s.tagattrparam, html_submit,
440 sizeof(html_submit)) == 0) {
441 s.inputtype = HTMLPARSER_INPUTTYPE_SUBMIT;
442 } else if(strncmp(s.tagattrparam, html_image,
443 sizeof(html_image)) == 0) {
444 s.inputtype = HTMLPARSER_INPUTTYPE_IMAGE;
445 } else if(strncmp(s.tagattrparam, html_text,
446 sizeof(html_text)) == 0) {
447 s.inputtype = HTMLPARSER_INPUTTYPE_TEXT;
448 } else {
449 s.inputtype = HTMLPARSER_INPUTTYPE_OTHER;
450 }
451 } else if(strncmp(s.tagattr, html_name,
452 sizeof(html_name)) == 0) {
453 strncpy(s.inputname, s.tagattrparam,
454 WWW_CONF_MAX_INPUTNAMELEN);
455 } else if(strncmp(s.tagattr, html_alt,
456 sizeof(html_alt)) == 0 &&
457 s.inputtype == HTMLPARSER_INPUTTYPE_IMAGE) {
458 strncpy(s.inputvalue, s.tagattrparam,
459 WWW_CONF_MAX_INPUTVALUELEN);
460 } else if(strncmp(s.tagattr, html_value,
461 sizeof(html_value)) == 0) {
462 strncpy(s.inputvalue, s.tagattrparam,
463 WWW_CONF_MAX_INPUTVALUELEN);
464 } else if(strncmp(s.tagattr, html_size,
465 sizeof(html_size)) == 0) {
466 size = 0;
467 if(s.tagattrparam[0] >= '0' &&
468 s.tagattrparam[0] <= '9') {
469 size = s.tagattrparam[0] - '0';
470 if(s.tagattrparam[1] >= '0' &&
471 s.tagattrparam[1] <= '9') {
472 size = size * 10 + (s.tagattrparam[1] - '0');
473 }
474 }
475 if(size >= WWW_CONF_MAX_INPUTVALUELEN) {
476 size = WWW_CONF_MAX_INPUTVALUELEN - 1;
477 }
478 s.inputvaluesize = size;
479 /* strncpy(s.inputvalue, s.tagattrparam,
480 WWW_CONF_MAX_INPUTVALUELEN);*/
481 }
482 }
483
484 }
485 break;
486#endif /* WWW_CONF_FORMS */
487#if WWW_CONF_RENDERSTATE
488 case TAG_CENTER:
489 parse_char(ISO_nl);
490 htmlparser_renderstate(HTMLPARSER_RENDERSTATE_BEGIN |
491 HTMLPARSER_RENDERSTATE_CENTER);
492 break;
493 case TAG_SLASHCENTER:
494 parse_char(ISO_nl);
495 htmlparser_renderstate(HTMLPARSER_RENDERSTATE_END |
496 HTMLPARSER_RENDERSTATE_CENTER);
497 break;
498#endif /* WWW_CONF_RENDERSTATE */
499 }
500}
501/*-----------------------------------------------------------------------------------*/
502void
503htmlparser_init(void)
504{
505 s.majorstate = s.lastmajorstate = MAJORSTATE_DISCARD;
506 s.minorstate = MINORSTATE_TEXT;
507 s.lastchar = 0;
508}
509/*-----------------------------------------------------------------------------------*/
adamdunkels9d1aaef2003-04-05 12:21:37 +0000510static char CC_FASTCALL
adamdunkelsca9ddcb2003-03-19 14:13:31 +0000511lowercase(char c)
512{
513 /* XXX: This is a *brute force* approach to lower-case
514 converting and should *not* be used anywhere else! It
515 works for our purposes, however (i.e., HTML tags). */
516 if(c > 0x40) {
517 return (c & 0x1f) | 0x60;
518 } else {
519 return c;
520 }
521}
522/*-----------------------------------------------------------------------------------*/
523static void
524endtagfound(void)
525{
526 s.tag[s.tagptr] = 0;
527 s.tagattr[s.tagattrptr] = 0;
528 s.tagattrparam[s.tagattrparamptr] = 0;
529}
530/*-----------------------------------------------------------------------------------*/
531/* htmlparser_parse():
532 *
533 * This is the main function in the HTML parser module and it parses
534 * the HTML data in the input buffer. The htmlparser_state is updated
535 * as the buffer is parsed character by character. The functions
536 * parse_char() and parse_tag() (defined earlier in this file) are
537 * called to process regular characters and HTML tags,
538 * respectively.
539 *
540 * Note that the input buffer does not have to contain full HTML tags;
541 * the parser is state machine driven in order to be able to work with
542 * buffers that have been divided in any way.
543 */
544void
545htmlparser_parse(char *data, u16_t len)
546{
547 static char c;
548
549 while(len > 0) {
550 c = *data;
551 --len;
552 ++data;
553
554 switch(s.minorstate) {
555 case MINORSTATE_NONE:
556 break;
557 case MINORSTATE_TEXT:
558 /* We are currently parsing some text, so we look for signs of
559 an HTML tag starting (i.e., a '<' character). We also
560 compress any whitespace character to one single space
561 character (' '). */
562 if(c == ISO_lt) {
563 s.minorstate = MINORSTATE_TAG;
564 s.tagptr = 0;
565 endtagfound();
566 } else if(c == ISO_ampersand) {
567 s.minorstate = MINORSTATE_EXTCHAR;
568 } else {
569 if(iswhitespace(c)) {
570 if(s.lastchar != ISO_space) {
571 parse_char(' ');
572 s.lastchar = ISO_space;
573 c = ISO_space;
574 }
575 } else {
576 parse_char(c);
577 }
578 }
579 break;
580 case MINORSTATE_EXTCHAR:
581 if(c == ISO_semicolon) {
582 s.minorstate = MINORSTATE_TEXT;
583 parse_char(' ');
584 } else if(iswhitespace(c)) {
585 s.minorstate = MINORSTATE_TEXT;
586 parse_char('&');
587 parse_char(' ');
588 }
589 break;
590 case MINORSTATE_TAG:
591 /* We are currently parsing within the name of a tag. We check
592 for the end of a tag (the '>' character) or whitespace (which
593 indicates that we should parse a tag attr argument
594 instead). */
595 if(c == ISO_gt) {
596 /* Full tag found. We continue parsing regular text. */
597 s.minorstate = MINORSTATE_TEXT;
598 s.tagattrptr = s.tagattrparamptr = 0;
599 endtagfound();
600 parse_tag();
601 } else if(iswhitespace(c)) {
602 /* The name of the tag found. We continue parsing the tag
603 attr.*/
604 s.minorstate = MINORSTATE_TAGATTR;
605 s.tagattrptr = 0;
606 endtagfound();
607 } else {
608 /* Keep track of the name of the tag, but convert it to
609 lower case. */
610
611 s.tag[s.tagptr] = lowercase(c);
612 ++s.tagptr;
613 /* Check if the ->tag field is full. If so, we just eat up
614 any data left in the tag. */
615 if(s.tagptr == sizeof(s.tag)) {
616 s.minorstate = MINORSTATE_TAGEND;
617 }
618 }
619
620 /* Check for HTML comment, indicated by <!-- */
621 if(s.tagptr == 3 &&
622 s.tag[0] == ISO_bang &&
623 s.tag[1] == ISO_dash &&
624 s.tag[2] == ISO_dash) {
625 PRINTF(("Starting comment...\n"));
626 s.minorstate = MINORSTATE_HTMLCOMMENT;
627 s.tagptr = 0;
628 endtagfound();
629 }
630 break;
631 case MINORSTATE_TAGATTR:
632 /* We parse the "tag attr", i.e., the "href" in <a
633 href="...">. */
634 if(c == ISO_gt) {
635 /* Full tag found. */
636 s.minorstate = MINORSTATE_TEXT;
637 s.tagattrparamptr = 0;
638 s.tagattrptr = 0;
639 endtagfound();
640 parse_tag();
641 s.tagptr = 0;
642 endtagfound();
643
644 } else if(iswhitespace(c)) {
645 if(s.tagattrptr == 0) {
646 /* Discard leading spaces. */
647 } else {
648 /* A non-leading space is the end of the attribute. */
649 s.tagattrparamptr = 0;
650 endtagfound();
651 parse_tag();
652 s.minorstate = MINORSTATE_TAGATTRSPACE;
653 /* s.tagattrptr = 0;
654 endtagfound();*/
655 }
656 } else if(c == ISO_eq) {
657 s.minorstate = MINORSTATE_TAGATTRPARAMNQ;
658 s.tagattrparamptr = 0;
659 endtagfound();
660 } else {
661 s.tagattr[s.tagattrptr] = lowercase(c);
662 ++s.tagattrptr;
663 /* Check if the "tagattr" field is full. If so, we just eat
664 up any data left in the tag. */
665 if(s.tagattrptr == sizeof(s.tagattr)) {
666 s.minorstate = MINORSTATE_TAGEND;
667 }
668 }
669 break;
670 case MINORSTATE_TAGATTRSPACE:
671 if(iswhitespace(c)) {
672 /* Discard spaces. */
673 } else if(c == ISO_eq) {
674 s.minorstate = MINORSTATE_TAGATTRPARAMNQ;
675 s.tagattrparamptr = 0;
676 endtagfound();
677 parse_tag();
678 } else {
679 s.tagattr[0] = lowercase(c);
680 s.tagattrptr = 1;
681 s.minorstate = MINORSTATE_TAGATTR;
682 }
683 break;
684 case MINORSTATE_TAGATTRPARAMNQ:
685 /* We are parsing the "tag attr parameter", i.e., the link part
686 in <a href="link">. */
687 if(c == ISO_gt) {
688 /* Full tag found. */
689 endtagfound();
690 parse_tag();
691 s.minorstate = MINORSTATE_TEXT;
692 s.tagattrptr = 0;
693 endtagfound();
694 parse_tag();
695 s.tagptr = 0;
696 endtagfound();
697 } else if(iswhitespace(c) &&
698 s.tagattrparamptr == 0) {
699 /* Discard leading spaces. */
700 } else if((c == ISO_citation ||
701 c == ISO_citation2) &&
702 s.tagattrparamptr == 0) {
703 s.minorstate = MINORSTATE_TAGATTRPARAM;
704 s.quotechar = c;
705 PRINTF(("tag attr param q found\n"));
706 } else if(iswhitespace(c)) {
707 PRINTF(("Non-leading space found at %d\n",
708 s.tagattrparamptr));
709 /* Stop parsing if a non-leading space was found */
710 endtagfound();
711 parse_tag();
712
713 s.minorstate = MINORSTATE_TAGATTR;
714 s.tagattrptr = 0;
715 endtagfound();
716 } else {
717 s.tagattrparam[s.tagattrparamptr] = c;
718 ++s.tagattrparamptr;
719 /* Check if the "tagattr" field is full. If so, we just eat
720 up any data left in the tag. */
721 if(s.tagattrparamptr >= sizeof(s.tagattrparam) - 1) {
722 s.minorstate = MINORSTATE_TAGEND;
723 }
724 }
725
726 break;
727 case MINORSTATE_TAGATTRPARAM:
728 /* We are parsing the "tag attr parameter", i.e., the link
729 part in <a href="link">. */
730 if(c == s.quotechar) {
731 /* Found end of tag attr parameter. */
732 endtagfound();
733 parse_tag();
734
735 s.minorstate = MINORSTATE_TAGATTR;
736 s.tagattrptr = 0;
737 endtagfound();
738 } else {
739 if(iswhitespace(c)) {
740 c = ISO_space;
741 }
742 s.tagattrparam[s.tagattrparamptr] = c;
743 ++s.tagattrparamptr;
744 /* Check if the "tagattr" field is full. If so, we just eat
745 up any data left in the tag. */
746 if(s.tagattrparamptr >= sizeof(s.tagattrparam) - 1) {
747 s.minorstate = MINORSTATE_TAGEND;
748 }
749 }
750
751 break;
752 case MINORSTATE_HTMLCOMMENT:
753 if(c == ISO_dash) {
754 ++s.tagptr;
755 } else if(c == ISO_gt && s.tagptr > 0) {
756 PRINTF(("Comment done.\n"));
757 s.minorstate = MINORSTATE_TEXT;
758 } else {
759 s.tagptr = 0;
760 }
761 break;
762 case MINORSTATE_TAGEND:
763 /* Discard characters until a '>' is seen. */
764 if(c == ISO_gt) {
765 s.minorstate = MINORSTATE_TEXT;
766 s.tagattrptr = 0;
767 endtagfound();
768 parse_tag();
769 }
770 break;
771 }
772
773 s.lastchar = c;
774 }
775}
776/*-----------------------------------------------------------------------------------*/