| /* |
| * Copyright (c) 2002, Adam Dunkels. |
| * All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above |
| * copyright notice, this list of conditions and the following |
| * disclaimer in the documentation and/or other materials provided |
| * with the distribution. |
| * 3. The name of the author may not be used to endorse or promote |
| * products derived from this software without specific prior |
| * written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS |
| * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY |
| * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE |
| * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
| * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| * |
| * This file is part of the Contiki desktop environment |
| * |
| * $Id: htmlparser.c,v 1.7 2004/09/03 09:55:22 adamdunkels Exp $ |
| * |
| */ |
| |
| /* htmlparser.c: |
| * |
| * Implements a very simplistic HTML parser. It recognizes HTML links |
| * (<a href>-tags), HTML img alt tags, a few text flow break tags |
| * (<br>, <p>, <h>), the <li> tag (but does not even try to |
| * distinguish between <ol> or <ul>) as well as HTML comment tags |
| * (<!-- -->). |
| * |
| * To save memory, the HTML parser is state machine driven, which |
| * means that it will shave off one character from the HTML page, |
| * process that character, and return to the next. Another way of |
| * doing it would be to buffer a number of characters and process them |
| * together. |
| * |
| * The main function in this file is the htmlparser_parse() function |
| * which takes a htmlparser_state structure and a part of an HTML file |
| * as an argument. The htmlparser_parse() function will call the |
| * helper functions parse_char() and parse_tag(). Those functions will |
| * in turn call the two callback functions htmlparser_char() and |
| * htmlparser_tag(). Those functions must be implemented by the using |
| * module (e.g., a web browser program). |
| * |
| * htmlparser_char() will be called for every non-tag character. |
| * |
| * htmlparser_tag() will be called whenever a full tag has been found. |
| * |
| */ |
| |
| |
| #include "htmlparser.h" |
| #include "html-strings.h" |
| #include "www-conf.h" |
| #include "cc.h" |
| |
| #include <string.h> |
| |
/*
 * Debug tracing.  PRINTF() takes one fully parenthesised printf()
 * argument list, e.g. PRINTF(("value %d\n", v)).  Tracing is compiled
 * out here; change the condition below to 1 to forward it to printf().
 */
#if 0
#include <stdio.h>
#define PRINTF(x) printf x
#else
#define PRINTF(x)
#endif
| |
| |
| /*-----------------------------------------------------------------------------------*/ |
/* Numeric character codes used by the parser. */

/* Upper-case letters. */
#define ISO_A         0x41
#define ISO_B         0x42
#define ISO_E         0x45
#define ISO_F         0x46
#define ISO_G         0x47
#define ISO_H         0x48
#define ISO_I         0x49
#define ISO_L         0x4c
#define ISO_M         0x4d
#define ISO_P         0x50
#define ISO_R         0x52
#define ISO_T         0x54

/* Lower-case letters: the upper-case code with bit 0x20 set. */
#define ISO_a         (ISO_A | 0x20)
#define ISO_b         (ISO_B | 0x20)
#define ISO_e         (ISO_E | 0x20)
#define ISO_f         (ISO_F | 0x20)
#define ISO_g         (ISO_G | 0x20)
#define ISO_h         (ISO_H | 0x20)
#define ISO_i         (ISO_I | 0x20)
#define ISO_l         (ISO_L | 0x20)
#define ISO_m         (ISO_M | 0x20)
#define ISO_p         (ISO_P | 0x20)
#define ISO_r         (ISO_R | 0x20)
#define ISO_t         (ISO_T | 0x20)

/* Whitespace and punctuation. */
#define ISO_ht        0x09   /* horizontal tab */
#define ISO_nl        0x0a   /* line feed */
#define ISO_cr        0x0d   /* carriage return */
#define ISO_space     0x20
#define ISO_bang      0x21   /* ! */
#define ISO_citation  0x22   /* double quote */
#define ISO_ampersand 0x26   /* & */
#define ISO_citation2 0x27   /* single quote */
#define ISO_asterisk  0x2a   /* * */
#define ISO_dash      0x2d   /* - */
#define ISO_slash     0x2f   /* / */
#define ISO_semicolon 0x3b   /* ; */
#define ISO_lt        0x3c   /* < */
#define ISO_eq        0x3d   /* = */
#define ISO_gt        0x3e   /* > */

/* Note: 0x5b is '[' and 0x5d is ']', so these two names read reversed
   relative to the glyphs.  parse_tag() emits ISO_rbrack first and
   ISO_lbrack last, which yields "[" ... "]" on output. */
#define ISO_rbrack    0x5b
#define ISO_lbrack    0x5d
| |
/*
 * Minor states: where the character-level scanner in parse_word()
 * currently is.
 */
#define MINORSTATE_NONE           0
#define MINORSTATE_TEXT           1  /* Parsing normal text. */
#define MINORSTATE_EXTCHAR        2  /* Inside "&...": looking for the
                                        semi-colon. */
#define MINORSTATE_TAG            3  /* Collecting the name of a tag. */
#define MINORSTATE_TAGEND         4  /* Skipping to the end of a tag. */
#define MINORSTATE_TAGATTR        5  /* Collecting a tag attribute name. */
#define MINORSTATE_TAGATTRSPACE   6  /* Optional space after a tag
                                        attribute name. */
#define MINORSTATE_TAGATTRPARAM   7  /* Collecting a quoted attribute
                                        parameter. */
#define MINORSTATE_TAGATTRPARAMNQ 8  /* Collecting an attribute parameter
                                        without quotation marks. */
#define MINORSTATE_HTMLCOMMENT    9  /* Skipping to the end of an HTML
                                        comment. */

/*
 * Major states: what kind of content the parser is currently inside;
 * do_word() and parse_tag() consult these to decide what to do with
 * collected text.
 */
#define MAJORSTATE_NONE    0
#define MAJORSTATE_BODY    1
#define MAJORSTATE_LINK    2
#define MAJORSTATE_FORM    3
#define MAJORSTATE_DISCARD 4
| |
| |
| struct htmlparser_state { |
| |
| unsigned char minorstate; |
| char tag[20]; |
| unsigned char tagptr; |
| char tagattr[20]; |
| unsigned char tagattrptr; |
| char tagattrparam[WWW_CONF_MAX_URLLEN]; |
| unsigned char tagattrparamptr; |
| unsigned char lastchar, quotechar; |
| unsigned char majorstate, lastmajorstate; |
| char linkurl[WWW_CONF_MAX_URLLEN]; |
| |
| #define MAX_WORDLEN 40 |
| char word[MAX_WORDLEN]; |
| unsigned char wordlen; |
| |
| |
| #if WWW_CONF_FORMS |
| char formaction[WWW_CONF_MAX_FORMACTIONLEN]; |
| char formname[WWW_CONF_MAX_FORMNAMELEN]; |
| unsigned char inputtype; |
| char inputname[WWW_CONF_MAX_INPUTNAMELEN]; |
| char inputvalue[WWW_CONF_MAX_INPUTVALUELEN]; |
| unsigned char inputvaluesize; |
| #endif /* WWW_CONF_FORMS */ |
| }; |
| |
| static struct htmlparser_state s; |
| |
| /*-----------------------------------------------------------------------------------*/ |
| static char last[1] = {0xff}; |
| |
| static const char *tags[] = { |
| #define TAG_FIRST 0 |
| #define TAG_SLASHA 0 |
| html_slasha, |
| #define TAG_SLASHCENTER 1 |
| html_slashcenter, |
| #define TAG_SLASHFORM 2 |
| html_slashform, |
| #define TAG_SLASHH 3 |
| html_slashh, |
| #define TAG_SLASHSCRIPT 4 |
| html_slashscript, |
| #define TAG_SLASHSELECT 5 |
| html_slashselect, |
| #define TAG_SLASHSTYLE 6 |
| html_slashstyle, |
| #define TAG_A 7 |
| html_a, |
| #define TAG_BODY 8 |
| html_body, |
| #define TAG_BR 9 |
| html_br, |
| #define TAG_CENTER 10 |
| html_center, |
| #define TAG_FORM 11 |
| html_form, |
| #define TAG_FRAME 12 |
| html_frame, |
| #define TAG_H1 13 |
| html_h1, |
| #define TAG_H2 14 |
| html_h2, |
| #define TAG_H3 15 |
| html_h3, |
| #define TAG_H4 16 |
| html_h4, |
| #define TAG_IMG 17 |
| html_img, |
| #define TAG_INPUT 18 |
| html_input, |
| #define TAG_LI 19 |
| html_li, |
| #define TAG_P 20 |
| html_p, |
| #define TAG_SCRIPT 21 |
| html_script, |
| #define TAG_SELECT 22 |
| html_select, |
| #define TAG_STYLE 23 |
| html_style, |
| #define TAG_TR 24 |
| html_tr, |
| #define TAG_LAST 25 |
| last, |
| }; |
| |
| /*-----------------------------------------------------------------------------------*/ |
| static unsigned char CC_FASTCALL |
| iswhitespace(char c) |
| { |
| return (c == ISO_space || |
| c == ISO_nl || |
| c == ISO_cr || |
| c == ISO_ht); |
| } |
| /*-----------------------------------------------------------------------------------*/ |
| void |
| htmlparser_init(void) |
| { |
| s.majorstate = s.lastmajorstate = MAJORSTATE_DISCARD; |
| s.minorstate = MINORSTATE_TEXT; |
| s.lastchar = 0; |
| } |
| /*-----------------------------------------------------------------------------------*/ |
| static char CC_FASTCALL |
| lowercase(char c) |
| { |
| /* XXX: This is a *brute force* approach to lower-case |
| converting and should *not* be used anywhere else! It |
| works for our purposes, however (i.e., HTML tags). */ |
| if(c > 0x40) { |
| return (c & 0x1f) | 0x60; |
| } else { |
| return c; |
| } |
| } |
| /*-----------------------------------------------------------------------------------*/ |
| static void |
| endtagfound(void) |
| { |
| s.tag[s.tagptr] = 0; |
| s.tagattr[s.tagattrptr] = 0; |
| s.tagattrparam[s.tagattrparamptr] = 0; |
| } |
| /*-----------------------------------------------------------------------------------*/ |
| static void CC_FASTCALL |
| switch_majorstate(unsigned char newstate) |
| { |
| if(s.majorstate != newstate) { |
| PRINTF(("Switching state from %d to %d (%d)\n", s.majorstate, newstate, s.lastmajorstate)); |
| s.lastmajorstate = s.majorstate; |
| s.majorstate = newstate; |
| } |
| } |
| /*-----------------------------------------------------------------------------------*/ |
| static void CC_FASTCALL |
| add_char(unsigned char c) |
| { |
| if(s.wordlen < MAX_WORDLEN && |
| c < 0x80) { |
| s.word[s.wordlen] = c; |
| ++s.wordlen; |
| if(s.wordlen == MAX_WORDLEN) { |
| s.wordlen = MAX_WORDLEN - 1; |
| } |
| } |
| } |
| /*-----------------------------------------------------------------------------------*/ |
| static void |
| do_word(void) |
| { |
| if(s.wordlen > 0) { |
| if(s.majorstate == MAJORSTATE_LINK) { |
| if(s.word[s.wordlen] != ISO_space) { |
| add_char(ISO_space); |
| } |
| } else if(s.majorstate == MAJORSTATE_DISCARD) { |
| s.wordlen = 0; |
| } else { |
| s.word[s.wordlen] = '\0'; |
| htmlparser_word(s.word, s.wordlen); |
| s.wordlen = 0; |
| } |
| } |
| } |
| /*-----------------------------------------------------------------------------------*/ |
/*
 * Emit a line break: any pending word is finished first through
 * do_word(), then the htmlparser_newline() callback is invoked.
 */
static void
newline(void)
{
  do_word();             /* Flush pending text before the break. */
  htmlparser_newline();
}
| /*-----------------------------------------------------------------------------------*/ |
| static unsigned char CC_FASTCALL |
| find_tag(char *tag) |
| { |
| static unsigned char first, last, i, tabi; |
| static char tagc; |
| |
| first = TAG_FIRST; |
| last = TAG_LAST; |
| i = 0; |
| |
| do { |
| tagc = tag[i]; |
| |
| if(tagc == 0 && |
| tags[first][i] == 0) { |
| return first; |
| } |
| |
| tabi = first; |
| |
| /* First, find first matching tag from table. */ |
| while(tagc > (tags[tabi])[i] && |
| tabi < last) { |
| ++tabi; |
| } |
| first = tabi; |
| |
| /* Second, find last matching tag from table. */ |
| while(tagc == (tags[tabi])[i] && |
| tabi < last) { |
| ++tabi; |
| } |
| last = tabi; |
| |
| /* If first and last matching tags are equal, we have a non-match |
| and return. Else we continue with the next character. */ |
| ++i; |
| |
| } while(last != first); |
| return TAG_LAST; |
| } |
| /*-----------------------------------------------------------------------------------*/ |
| static void |
| parse_tag(void) |
| { |
| static char *tagattrparam; |
| static unsigned char size, i; |
| |
| static char dummy; |
| |
| PRINTF(("Parsing tag '%s' '%s' '%s'\n", |
| s.tag, s.tagattr, s.tagattrparam)); |
| |
| switch(find_tag(s.tag)) { |
| case TAG_P: |
| case TAG_H1: |
| case TAG_H2: |
| case TAG_H3: |
| case TAG_H4: |
| /* parse_char(ISO_nl);*/ |
| newline(); |
| /* FALLTHROUGH */ |
| case TAG_BR: |
| case TAG_TR: |
| case TAG_SLASHH: |
| /* parse_char(ISO_nl);*/ |
| dummy = 0; |
| newline(); |
| break; |
| case TAG_LI: |
| newline(); |
| add_char(ISO_asterisk); |
| add_char(ISO_space); |
| break; |
| case TAG_SCRIPT: |
| case TAG_STYLE: |
| case TAG_SELECT: |
| switch_majorstate(MAJORSTATE_DISCARD); |
| break; |
| case TAG_SLASHSCRIPT: |
| case TAG_SLASHSTYLE: |
| case TAG_SLASHSELECT: |
| switch_majorstate(s.lastmajorstate); |
| break; |
| case TAG_BODY: |
| s.majorstate = s.lastmajorstate = MAJORSTATE_BODY; |
| break; |
| case TAG_FRAME: |
| if(strncmp(s.tagattr, html_src, sizeof(html_src)) == 0 && |
| s.tagattrparam[0] != 0) { |
| switch_majorstate(MAJORSTATE_BODY); |
| newline(); |
| add_char(ISO_rbrack); |
| do_word(); |
| htmlparser_link((char *)html_frame, strlen(html_frame), s.tagattrparam); |
| PRINTF(("Frame [%s]\n", s.tagattrparam)); |
| add_char(ISO_lbrack); |
| newline(); |
| } |
| break; |
| case TAG_IMG: |
| if(strncmp(s.tagattr, html_alt, sizeof(html_alt)) == 0 && |
| s.tagattrparam[0] != 0) { |
| /* parse_char(ISO_lt);*/ |
| add_char(ISO_lt); |
| tagattrparam = &s.tagattrparam[0]; |
| while(*tagattrparam) { |
| /* parse_char(*tagattrparam);*/ |
| add_char(*tagattrparam); |
| ++tagattrparam; |
| } |
| /* parse_char(ISO_gt);*/ |
| add_char(ISO_gt); |
| do_word(); |
| } |
| break; |
| case TAG_A: |
| PRINTF(("A %s %s\n", s.tagattr, s.tagattrparam)); |
| if(strncmp(s.tagattr, html_href, sizeof(html_href)) == 0 && |
| s.tagattrparam[0] != 0) { |
| strcpy(s.linkurl, s.tagattrparam); |
| do_word(); |
| switch_majorstate(MAJORSTATE_LINK); |
| } |
| break; |
| case TAG_SLASHA: |
| if(s.majorstate == MAJORSTATE_LINK) { |
| switch_majorstate(s.lastmajorstate); |
| s.word[s.wordlen] = 0; |
| htmlparser_link(s.word, s.wordlen, s.linkurl); |
| s.wordlen = 0; |
| } |
| break; |
| #if WWW_CONF_FORMS |
| case TAG_FORM: |
| PRINTF(("Form tag\n")); |
| switch_majorstate(MAJORSTATE_FORM); |
| if(strncmp(s.tagattr, html_action, sizeof(html_action)) == 0) { |
| PRINTF(("Form action '%s'\n", s.tagattrparam)); |
| strncpy(s.formaction, s.tagattrparam, WWW_CONF_MAX_FORMACTIONLEN - 1); |
| } else if(strncmp(s.tagattr, html_name, sizeof(html_name)) == 0) { |
| PRINTF(("Form name '%s'\n", s.tagattrparam)); |
| strncpy(s.formname, s.tagattrparam, WWW_CONF_MAX_FORMNAMELEN - 1); |
| } |
| s.inputname[0] = s.inputvalue[0] = 0; |
| break; |
| case TAG_SLASHFORM: |
| switch_majorstate(MAJORSTATE_BODY); |
| s.formaction[0] = s.formname[0] = 0; |
| break; |
| case TAG_INPUT: |
| if(s.majorstate == MAJORSTATE_FORM) { |
| /* First check if we are called at the end of an input tag. If |
| so, we should render the input widget. */ |
| if(s.tagattr[0] == 0 && |
| s.inputname[0] != 0) { |
| PRINTF(("Render input type %d\n", s.inputtype)); |
| switch(s.inputtype) { |
| case HTMLPARSER_INPUTTYPE_NONE: |
| case HTMLPARSER_INPUTTYPE_TEXT: |
| for(i = 0; i < s.inputvaluesize; ++i) { |
| if(s.inputvalue[i] == 0) { |
| memset(&s.inputvalue[i], ISO_space, s.inputvaluesize - i); |
| s.inputvalue[s.inputvaluesize] = 0; |
| break; |
| } |
| } |
| htmlparser_inputfield(s.inputvalue, s.inputname, |
| s.formname, s.formaction); |
| break; |
| case HTMLPARSER_INPUTTYPE_SUBMIT: |
| case HTMLPARSER_INPUTTYPE_IMAGE: |
| htmlparser_submitbutton(s.inputvalue, s.inputname, |
| s.formname, s.formaction); |
| break; |
| } |
| s.inputtype = HTMLPARSER_INPUTTYPE_NONE; |
| } else { |
| PRINTF(("Input '%s' '%s'\n", s.tagattr, s.tagattrparam)); |
| if(strncmp(s.tagattr, html_type, sizeof(html_type)) == 0) { |
| if(strncmp(s.tagattrparam, html_submit, |
| sizeof(html_submit)) == 0) { |
| s.inputtype = HTMLPARSER_INPUTTYPE_SUBMIT; |
| } else if(strncmp(s.tagattrparam, html_image, |
| sizeof(html_image)) == 0) { |
| s.inputtype = HTMLPARSER_INPUTTYPE_IMAGE; |
| } else if(strncmp(s.tagattrparam, html_text, |
| sizeof(html_text)) == 0) { |
| s.inputtype = HTMLPARSER_INPUTTYPE_TEXT; |
| } else { |
| s.inputtype = HTMLPARSER_INPUTTYPE_OTHER; |
| } |
| } else if(strncmp(s.tagattr, html_name, |
| sizeof(html_name)) == 0) { |
| strncpy(s.inputname, s.tagattrparam, |
| WWW_CONF_MAX_INPUTNAMELEN); |
| } else if(strncmp(s.tagattr, html_alt, |
| sizeof(html_alt)) == 0 && |
| s.inputtype == HTMLPARSER_INPUTTYPE_IMAGE) { |
| strncpy(s.inputvalue, s.tagattrparam, |
| WWW_CONF_MAX_INPUTVALUELEN); |
| } else if(strncmp(s.tagattr, html_value, |
| sizeof(html_value)) == 0) { |
| strncpy(s.inputvalue, s.tagattrparam, |
| WWW_CONF_MAX_INPUTVALUELEN); |
| } else if(strncmp(s.tagattr, html_size, |
| sizeof(html_size)) == 0) { |
| size = 0; |
| if(s.tagattrparam[0] >= '0' && |
| s.tagattrparam[0] <= '9') { |
| size = s.tagattrparam[0] - '0'; |
| if(s.tagattrparam[1] >= '0' && |
| s.tagattrparam[1] <= '9') { |
| size = size * 10 + (s.tagattrparam[1] - '0'); |
| } |
| } |
| if(size >= WWW_CONF_MAX_INPUTVALUELEN) { |
| size = WWW_CONF_MAX_INPUTVALUELEN - 1; |
| } |
| s.inputvaluesize = size; |
| /* strncpy(s.inputvalue, s.tagattrparam, |
| WWW_CONF_MAX_INPUTVALUELEN);*/ |
| } |
| } |
| |
| } |
| break; |
| #endif /* WWW_CONF_FORMS */ |
| #if WWW_CONF_RENDERSTATE |
| case TAG_CENTER: |
| /* parse_char(ISO_nl); */ |
| newline(); |
| htmlparser_renderstate(HTMLPARSER_RENDERSTATE_BEGIN | |
| HTMLPARSER_RENDERSTATE_CENTER); |
| break; |
| case TAG_SLASHCENTER: |
| /* parse_char(ISO_nl);*/ |
| newline(); |
| htmlparser_renderstate(HTMLPARSER_RENDERSTATE_END | |
| HTMLPARSER_RENDERSTATE_CENTER); |
| break; |
| #endif /* WWW_CONF_RENDERSTATE */ |
| } |
| } |
| /*-----------------------------------------------------------------------------------*/ |
| static u16_t |
| parse_word(char *data, u8_t dlen) |
| { |
| static u8_t i; |
| static u8_t len; |
| unsigned char c; |
| |
| len = dlen; |
| |
| switch(s.minorstate) { |
| case MINORSTATE_TEXT: |
| for(i = 0; i < len; ++i) { |
| c = data[i]; |
| if(iswhitespace(c)) { |
| do_word(); |
| } else if(c == ISO_lt) { |
| s.minorstate = MINORSTATE_TAG; |
| s.tagptr = 0; |
| /* do_word();*/ |
| break; |
| } else if(c == ISO_ampersand) { |
| s.minorstate = MINORSTATE_EXTCHAR; |
| break; |
| } else { |
| add_char(c); |
| } |
| } |
| break; |
| case MINORSTATE_EXTCHAR: |
| for(i = 0; i < len; ++i) { |
| c = data[i]; |
| if(c == ISO_semicolon) { |
| s.minorstate = MINORSTATE_TEXT; |
| add_char(' '); |
| break; |
| } else if(iswhitespace(c)) { |
| s.minorstate = MINORSTATE_TEXT; |
| add_char('&'); |
| add_char(' '); |
| break; |
| } |
| } |
| break; |
| case MINORSTATE_TAG: |
| /* We are currently parsing within the name of a tag. We check |
| for the end of a tag (the '>' character) or whitespace (which |
| indicates that we should parse a tag attr argument |
| instead). */ |
| for(i = 0; i < len; ++i) { |
| c = data[i]; |
| if(c == ISO_gt) { |
| /* Full tag found. We continue parsing regular text. */ |
| s.minorstate = MINORSTATE_TEXT; |
| s.tagattrptr = s.tagattrparamptr = 0; |
| endtagfound(); |
| parse_tag(); |
| break; |
| } else if(iswhitespace(c)) { |
| /* The name of the tag found. We continue parsing the tag |
| attr.*/ |
| s.minorstate = MINORSTATE_TAGATTR; |
| s.tagattrptr = 0; |
| endtagfound(); |
| break; |
| } else { |
| /* Keep track of the name of the tag, but convert it to |
| lower case. */ |
| |
| s.tag[s.tagptr] = lowercase(c); |
| ++s.tagptr; |
| /* Check if the ->tag field is full. If so, we just eat up |
| any data left in the tag. */ |
| if(s.tagptr == sizeof(s.tag)) { |
| s.minorstate = MINORSTATE_TAGEND; |
| break; |
| } |
| } |
| |
| /* Check for HTML comment, indicated by <!-- */ |
| if(s.tagptr == 3 && |
| s.tag[0] == ISO_bang && |
| s.tag[1] == ISO_dash && |
| s.tag[2] == ISO_dash) { |
| PRINTF(("Starting comment...\n")); |
| s.minorstate = MINORSTATE_HTMLCOMMENT; |
| s.tagptr = 0; |
| endtagfound(); |
| break; |
| } |
| } |
| break; |
| case MINORSTATE_TAGATTR: |
| /* We parse the "tag attr", i.e., the "href" in <a |
| href="...">. */ |
| for(i = 0; i < len; ++i) { |
| c = data[i]; |
| if(c == ISO_gt) { |
| /* Full tag found. */ |
| s.minorstate = MINORSTATE_TEXT; |
| s.tagattrparamptr = 0; |
| s.tagattrptr = 0; |
| endtagfound(); |
| parse_tag(); |
| s.tagptr = 0; |
| endtagfound(); |
| break; |
| } else if(iswhitespace(c)) { |
| if(s.tagattrptr == 0) { |
| /* Discard leading spaces. */ |
| } else { |
| /* A non-leading space is the end of the attribute. */ |
| s.tagattrparamptr = 0; |
| endtagfound(); |
| parse_tag(); |
| s.minorstate = MINORSTATE_TAGATTRSPACE; |
| break; |
| /* s.tagattrptr = 0; |
| endtagfound();*/ |
| } |
| } else if(c == ISO_eq) { |
| s.minorstate = MINORSTATE_TAGATTRPARAMNQ; |
| s.tagattrparamptr = 0; |
| endtagfound(); |
| break; |
| } else { |
| s.tagattr[s.tagattrptr] = lowercase(c); |
| ++s.tagattrptr; |
| /* Check if the "tagattr" field is full. If so, we just eat |
| up any data left in the tag. */ |
| if(s.tagattrptr == sizeof(s.tagattr)) { |
| s.minorstate = MINORSTATE_TAGEND; |
| break; |
| } |
| } |
| } |
| break; |
| case MINORSTATE_TAGATTRSPACE: |
| for(i = 0; i < len; ++i) { |
| c = data[i]; |
| if(iswhitespace(c)) { |
| /* Discard spaces. */ |
| } else if(c == ISO_eq) { |
| s.minorstate = MINORSTATE_TAGATTRPARAMNQ; |
| s.tagattrparamptr = 0; |
| endtagfound(); |
| parse_tag(); |
| break; |
| } else { |
| s.tagattr[0] = lowercase(c); |
| s.tagattrptr = 1; |
| s.minorstate = MINORSTATE_TAGATTR; |
| break; |
| } |
| } |
| break; |
| case MINORSTATE_TAGATTRPARAMNQ: |
| /* We are parsing the "tag attr parameter", i.e., the link part |
| in <a href="link">. */ |
| for(i = 0; i < len; ++i) { |
| c = data[i]; |
| if(c == ISO_gt) { |
| /* Full tag found. */ |
| endtagfound(); |
| parse_tag(); |
| s.minorstate = MINORSTATE_TEXT; |
| s.tagattrptr = 0; |
| endtagfound(); |
| parse_tag(); |
| s.tagptr = 0; |
| endtagfound(); |
| break; |
| } else if(iswhitespace(c) && |
| s.tagattrparamptr == 0) { |
| /* Discard leading spaces. */ |
| } else if((c == ISO_citation || |
| c == ISO_citation2) && |
| s.tagattrparamptr == 0) { |
| s.minorstate = MINORSTATE_TAGATTRPARAM; |
| s.quotechar = c; |
| PRINTF(("tag attr param q found\n")); |
| break; |
| } else if(iswhitespace(c)) { |
| PRINTF(("Non-leading space found at %d\n", |
| s.tagattrparamptr)); |
| /* Stop parsing if a non-leading space was found */ |
| endtagfound(); |
| parse_tag(); |
| |
| s.minorstate = MINORSTATE_TAGATTR; |
| s.tagattrptr = 0; |
| endtagfound(); |
| break; |
| } else { |
| s.tagattrparam[s.tagattrparamptr] = c; |
| ++s.tagattrparamptr; |
| /* Check if the "tagattr" field is full. If so, we just eat |
| up any data left in the tag. */ |
| if(s.tagattrparamptr >= sizeof(s.tagattrparam) - 1) { |
| s.minorstate = MINORSTATE_TAGEND; |
| break; |
| } |
| } |
| } |
| break; |
| case MINORSTATE_TAGATTRPARAM: |
| /* We are parsing the "tag attr parameter", i.e., the link |
| part in <a href="link">. */ |
| for(i = 0; i < len; ++i) { |
| c = data[i]; |
| if(c == s.quotechar) { |
| /* Found end of tag attr parameter. */ |
| endtagfound(); |
| parse_tag(); |
| |
| s.minorstate = MINORSTATE_TAGATTR; |
| s.tagattrptr = 0; |
| endtagfound(); |
| break; |
| } else { |
| if(iswhitespace(c)) { |
| s.tagattrparam[s.tagattrparamptr] = ISO_space; |
| } else { |
| s.tagattrparam[s.tagattrparamptr] = c; |
| } |
| |
| ++s.tagattrparamptr; |
| /* Check if the "tagattr" field is full. If so, we just eat |
| up any data left in the tag. */ |
| if(s.tagattrparamptr >= sizeof(s.tagattrparam) - 1) { |
| s.minorstate = MINORSTATE_TAGEND; |
| break; |
| } |
| } |
| } |
| break; |
| case MINORSTATE_HTMLCOMMENT: |
| for(i = 0; i < len; ++i) { |
| c = data[i]; |
| if(c == ISO_dash) { |
| ++s.tagptr; |
| } else if(c == ISO_gt && s.tagptr > 0) { |
| PRINTF(("Comment done.\n")); |
| s.minorstate = MINORSTATE_TEXT; |
| break; |
| } else { |
| s.tagptr = 0; |
| } |
| } |
| break; |
| case MINORSTATE_TAGEND: |
| /* Discard characters until a '>' is seen. */ |
| for(i = 0; i < len; ++i) { |
| if(data[i] == ISO_gt) { |
| s.minorstate = MINORSTATE_TEXT; |
| s.tagattrptr = 0; |
| endtagfound(); |
| parse_tag(); |
| break; |
| } |
| } |
| break; |
| default: |
| i = 0; |
| break; |
| } |
| if(i >= len) { |
| return len; |
| } |
| return i + 1; |
| } |
| /*-----------------------------------------------------------------------------------*/ |
| void |
| htmlparser_parse(char *data, u16_t datalen) |
| { |
| u16_t plen; |
| |
| while(datalen > 0) { |
| if(datalen > 255) { |
| plen = parse_word(data, 255); |
| } else { |
| plen = parse_word(data, datalen); |
| } |
| datalen -= plen; |
| data += plen; |
| } |
| } |
| /*-----------------------------------------------------------------------------------*/ |