URI: 
       add initial "optional" tag handling, rework some tag handling - webdump - [FORK] git://git.codemadness.org/webdump
  HTML git clone git://git.z3bra.org/webdump.git
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
   DIR commit 1c95e7d86a0dc62670a87f755b3507ceab912ec1
   DIR parent ff081bef0cfc66b4a5960996a0fc41dea868ac60
  HTML Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Wed, 11 Mar 2020 15:44:22 +0100
       
       add initial "optional" tag handling, rework some tag handling
       
       - add initial support for "optional" closing tags, which are used in HTML but
         not in XHTML.
       - align the tags table.
       - rework some end tag handling.
       
       Diffstat:
         M webdump.c                           |     280 +++++++++++++++++++------------
       
       1 file changed, 177 insertions(+), 103 deletions(-)
       ---
   DIR diff --git a/webdump.c b/webdump.c
       @@ -50,11 +50,11 @@ static int linkcount;
        
        enum DisplayType {
                DisplayUnknown     = 0,
       -        DisplayNone        = 1 << 0,
       -        DisplayPre         = 1 << 1,
       -        DisplayInline      = 1 << 2,
       -        DisplayInlineBlock = 1 << 3,
       -        DisplayBlock       = 1 << 4,
       +        DisplayInline      = 1 << 0,
       +        DisplayInlineBlock = 1 << 1,
       +        DisplayBlock       = 1 << 2,
       +        DisplayNone        = 1 << 3,
       +        DisplayPre         = 1 << 4,
                DisplayList        = 1 << 5,
                DisplayListOrdered = 1 << 6,
                DisplayListItem    = 1 << 7,
       @@ -75,6 +75,7 @@ struct tag {
                enum DisplayType displaytype;
                enum DisplayType parenttype; /* display type belonging to element */
                int isvoid; /* "void" element */
       +        int isoptional; /* optional to close tag */
        };
        
        struct node {
       @@ -112,60 +113,84 @@ static size_t ncharsline = 0;
        static struct node nodes[MAX_DEPTH];
        static int curnode;
        
       +#if 0
       +/* TODO: optional tags */
       +{ "body",     0, 0, 0, 1 },
       +{ "colgroup", 0, 0, 0, 1 },
       +{ "dd",       0, 0, 0, 1 },
       +{ "dt",       0, 0, 0, 1 },
       +{ "head",     0, 0, 0, 1 },
       +{ "html",     0, 0, 0, 1 },
       +{ "li",       0, 0, 0, 1 },
       +{ "optgroup", 0, 0, 0, 1 },
       +{ "option",   0, 0, 0, 1 },
       +{ "option",   0, 0, 0, 1 },
       +{ "p",        0, 0, 0, 1 },
       +{ "rp",       0, 0, 0, 1 },
       +{ "rt",       0, 0, 0, 1 },
       +{ "tbody",    0, 0, 0, 1 },
       +{ "td",       0, 0, 0, 1 },
       +{ "tfoot",    0, 0, 0, 1 },
       +{ "th",       0, 0, 0, 1 },
       +{ "thead",    0, 0, 0, 1 },
       +{ "tr",       0, 0, 0, 1 },
       +#endif
       +
       +/* tag          displaytype                       p                v  o */
        static struct tag tags[] = {
       -{ "a",       DisplayInline | DisplayUnderline, 0, 0 },
       -{ "area",    DisplayInline, 0, 1 },
       -{ "article", DisplayBlock },
       -{ "audio",   DisplayInline | DisplayUnderline, 0, 0 },
       -{ "b",       DisplayInline | DisplayBold },
       -{ "base",    DisplayInline, 0, 1 },
       -{ "blink",   DisplayInline | DisplayBlink },
       -{ "blockquote", DisplayBlock },
       -{ "br",      0, 0, 1 },
       -{ "code",    DisplayPre },
       -{ "col",     DisplayInline, 0, 1 },
       -{ "del",     DisplayInline | DisplayStrike },
       -{ "div",     DisplayBlock },
       -{ "em",      DisplayInline | DisplayItalic },
       -{ "embed",   DisplayInline, 0, 1 },
       -{ "footer",  DisplayBlock },
       -{ "h1",      DisplayHeader | DisplayBold },
       -{ "h2",      DisplayHeader | DisplayBold },
       -{ "h3",      DisplayHeader | DisplayBold },
       -{ "h4",      DisplayHeader | DisplayBold },
       -{ "h5",      DisplayHeader | DisplayBold },
       -{ "h6",      DisplayHeader | DisplayBold },
       -{ "header",  DisplayBlock },
       -{ "hr",      DisplayBlock, 0, 1 },
       -{ "i",       DisplayInline | DisplayItalic },
       -{ "img",     DisplayInline | DisplayUnderline, 0, 1 },
       -{ "input",   DisplayInline, 0, 1 },
       -{ "li",      DisplayListItem, DisplayList },
       -{ "link",    DisplayInline, 0, 1 },
       -{ "main",    DisplayBlock },
       -{ "meta",    DisplayInline, 0, 1 },
       -{ "nav",     DisplayBlock },
       -{ "ol",      DisplayList | DisplayListOrdered },
       -{ "p",       DisplayBlock },
       -{ "param",   DisplayInline, 0, 1 },
       -{ "pre",     DisplayPre },
       -{ "s",       DisplayInline | DisplayStrike },
       -{ "script",  DisplayNone },
       -{ "source",  DisplayInline, 0, 1 },
       -{ "strike",  DisplayInline | DisplayStrike },
       -{ "strong",  DisplayInline | DisplayBold },
       -{ "style",   DisplayNone },
       -{ "table",   DisplayTable },
       -{ "td",      DisplayTableCell, DisplayTableRow },
       -{ "template", DisplayNone },
       -{ "th",      DisplayTableCell | DisplayBold, DisplayTableRow },
       -{ "title",   DisplayBlock },
       -{ "tr",      DisplayTableRow, DisplayTable },
       -{ "track",   DisplayInline, 0, 1 },
       -{ "u",       DisplayInline | DisplayUnderline },
       -{ "ul",      DisplayList },
       -{ "video",   DisplayInline | DisplayUnderline, 0, 0 },
       -{ "wbr",     DisplayInline, 0, 1 },
       +{ "a",          DisplayInline | DisplayUnderline, 0,               0, 0 },
       +{ "area",       DisplayInline,                    0,               1, 0 },
       +{ "article",    DisplayBlock,                     0,               0, 0 },
       +{ "audio",      DisplayInline | DisplayUnderline, 0,               0, 0 },
       +{ "b",          DisplayInline | DisplayBold,      0,               0, 0 },
       +{ "base",       DisplayInline,                    0,               1, 0 },
       +{ "blink",      DisplayInline | DisplayBlink,     0,               0, 0 },
       +{ "blockquote", DisplayBlock,                     0,               0, 0 },
       +{ "br",         0,                                0,               1, 0 },
       +{ "code",       DisplayPre,                       0,               0, 0 },
       +{ "col",        DisplayInline,                    0,               1, 0 },
       +{ "del",        DisplayInline | DisplayStrike,    0,               0, 0 },
       +{ "div",        DisplayBlock,                     0,               0, 0 },
       +{ "em",         DisplayInline | DisplayItalic,    0,               0, 0 },
       +{ "embed",      DisplayInline,                    0,               1, 0 },
       +{ "footer",     DisplayBlock,                     0,               0, 0 },
       +{ "h1",         DisplayHeader | DisplayBold,      0,               0, 0 },
       +{ "h2",         DisplayHeader | DisplayBold,      0,               0, 0 },
       +{ "h3",         DisplayHeader | DisplayBold,      0,               0, 0 },
       +{ "h4",         DisplayHeader | DisplayBold,      0,               0, 0 },
       +{ "h5",         DisplayHeader | DisplayBold,      0,               0, 0 },
       +{ "h6",         DisplayHeader | DisplayBold,      0,               0, 0 },
       +{ "header",     DisplayBlock,                     0,               0, 0 },
       +{ "hr",         DisplayBlock,                     0,               1, 0 },
       +{ "i",          DisplayInline | DisplayItalic,    0,               0, 0 },
       +{ "img",        DisplayInline | DisplayUnderline, 0,               1, 0 },
       +{ "input",      DisplayInline,                    0,               1, 0 },
       +{ "li",         DisplayListItem,                  DisplayList,     0, 1 },
       +{ "link",       DisplayInline,                    0,               1, 0 },
       +{ "main",       DisplayBlock,                     0,               0, 0 },
       +{ "meta",       DisplayInline,                    0,               1, 0 },
       +{ "nav",        DisplayBlock,                     0,               0, 0 },
       +{ "ol",         DisplayList | DisplayListOrdered, 0,               0, 0 },
       +{ "p",          DisplayBlock,                     0,               0, 1 },
       +{ "param",      DisplayInline,                    0,               1, 0 },
       +{ "pre",        DisplayPre,                       0,               0, 0 },
       +{ "s",          DisplayInline | DisplayStrike,    0,               0, 0 },
       +{ "script",     DisplayNone,                      0,               0, 0 },
       +{ "source",     DisplayInline,                    0,               1, 0 },
       +{ "strike",     DisplayInline | DisplayStrike,    0,               0, 0 },
       +{ "strong",     DisplayInline | DisplayBold,      0,               0, 0 },
       +{ "style",      DisplayNone,                      0,               0, 0 },
       +{ "table",      DisplayTable,                     0,               0, 0 },
       +{ "td",         DisplayTableCell,                 DisplayTableRow, 0, 0 },
       +{ "template",   DisplayNone,                      0,               0, 0 },
       +{ "th",         DisplayTableCell | DisplayBold,   DisplayTableRow, 0, 1 },
       +{ "title",      DisplayBlock,                     0,               0, 0 },
       +{ "tr",         DisplayTableRow,                  DisplayTable,    0, 1 },
       +{ "track",      DisplayInline,                    0,               1, 0 },
       +{ "u",          DisplayInline | DisplayUnderline, 0,               0, 0 },
       +{ "ul",         DisplayList,                      0,               0, 0 },
       +{ "video",      DisplayInline | DisplayUnderline, 0,               0, 0 },
       +{ "wbr",        DisplayInline,                    0,               1, 0 },
        };
        
        static const char *ignorestate, *endtag;
       @@ -560,7 +585,6 @@ xmldataentity(XMLParser *p, const char *data, size_t datalen)
                if (cur->tag.displaytype & DisplayNone)
                        return;
        
       -        /* convert basic XML entities */
                n = xml_entitytostr(data, buf, sizeof(buf));
                if (n > 0)
                        xmldata(p, buf, (size_t)n);
       @@ -588,50 +612,12 @@ findtag(const char *t)
        }
        
        static void
       -xmltagstart(XMLParser *x, const char *t, size_t tl)
       +tagend(struct node *cur)
        {
       -        struct tag *found;
       -        struct node *cur, *parent;
       -
       -        if (curnode >= MAX_DEPTH - 2)
       -                errx(1, "max tag depth reached: %d\n", curnode);
       -        parent = &nodes[curnode];
       -        curnode++;
       -
       -        cur = &nodes[curnode];
       -        memset(cur, 0, sizeof(*cur));
       -        /* tag defaults */
       -        cur->tag.displaytype = DisplayInline;
       -        cur->tag.name = cur->tagname;
       -        strlcpy(cur->tagname, t, sizeof(cur->tagname));
       -
       -        /* match tag */
       -        if ((found = findtag(t))) {
       -                cur->nchildren = 0;
       -                memcpy(&(cur->tag), found, sizeof(*found));
       -                /* parent tag is hidden, so hide ourself too */
       -                if (parent->tag.displaytype & DisplayNone)
       -                        cur->tag.displaytype |= DisplayNone;
       -                return;
       -        }
       -
       -        src[0] = '\0'; /* reset src, href */
       -}
       -
       -static void
       -xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
       -{
       -        struct tag *found;
       -        struct node *cur;
       -        int i;
       -
       -        /* ignore closing of void elements, like </br>, which is not allowed */
       -        if ((found = findtag(t))) {
       -                if (!isshort && found->isvoid)
       -                        return;
       -        }
       +        const char *t;
       +        size_t i;
        
       -        cur = &nodes[curnode];
       +        t = cur->tag.name;
        
                if (cur->tag.displaytype & DisplayBold)
                        printansi("\033[22m"); /* reset bold or faint */
       @@ -659,7 +645,7 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
                } else if (cur->tag.displaytype & DisplayHeader) {
                        newline();
        #if 1
       -                if (tl == 2 && t[0] == 'h' && t[1] >= '1' && t[1] <= '6') {
       +                if (t[0] == 'h' && t[1] >= '1' && t[1] <= '6' && t[2] == '\0') {
                                if (t[1] >= '3')
                                        for (i = 0; i < termwidth; i++)
                                                putchar('-');
       @@ -671,7 +657,95 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
        #endif
                }
        
       -        curnode--;
       +}
       +
       +static void
       +xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
       +{
       +        struct tag *found;
       +        int i;
       +
       +        /* ignore closing of void elements, like </br>, which is not allowed */
       +        if ((found = findtag(t))) {
       +                if (!isshort && found->isvoid)
       +                        return;
       +        }
       +
       +        /* if the current closing tag matches the current open tag */
       +        if (!strcasecmp(nodes[curnode].tag.name, t)) {
       +                tagend(&nodes[curnode]);
       +                if (curnode)
       +                        curnode--;
       +        } else {
       +                /* ... else lookup the first matching start tag. This is also
       +                   for handling optional closing tags */
       +                for (i = curnode; i > 0; i--) {
       +                        if (!strcasecmp(nodes[i].tag.name, t)) {
       +                                tagend(&nodes[i]);
       +                                curnode = i;
       +                                break;
       +                        }
       +                }
       +                if (curnode)
       +                        curnode--;
       +        }
       +}
       +
       +/* check if the specified tag is closed at some point in the current tree */
       +static int
       +istagclosed(int cur)
       +{
       +        int i;
       +
       +        if (!cur)
       +                return 0;
       +        for (i = cur - 1; i > 0; i--) {
       +                if (!strcasecmp(nodes[i].tag.name, nodes[cur].tag.name))
       +                        return 0;
       +        }
       +        return 1;
       +}
       +
       +static void
       +xmltagstart(XMLParser *x, const char *t, size_t tl)
       +{
       +        struct tag *found;
       +        struct node *cur, *parent;
       +        char *s;
       +
       +        if (curnode >= MAX_DEPTH - 2)
       +                errx(1, "max tag depth reached: %d\n", curnode);
       +        parent = &nodes[curnode];
       +        curnode++;
       +
       +        cur = &nodes[curnode];
       +        memset(cur, 0, sizeof(*cur));
       +        /* tag defaults */
       +        cur->tag.displaytype = DisplayInline;
       +        cur->tag.name = cur->tagname;
       +        strlcpy(cur->tagname, t, sizeof(cur->tagname));
       +        /* to lowercase */
       +        for (s = cur->tagname; *s; s++)
       +                *s = tolower((unsigned char)*s);
       +
       +        /* match tag */
       +        if ((found = findtag(t))) {
       +                cur->nchildren = 0;
       +                memcpy(&(cur->tag), found, sizeof(*found));
       +
       +                if (cur->tag.isoptional && curnode && !istagclosed(curnode)) {
       +                        /* if it's an unclosed tag and it has parent (like ol, ul)
       +                           then fake the end tag. */
       +                        tagend(&nodes[curnode]);
       +                }
       +
       +                /* parent tag is hidden, so hide ourself too */
       +                if (parent->tag.displaytype & DisplayNone)
       +                        cur->tag.displaytype |= DisplayNone;
       +                return;
       +        }
       +
       +        src[0] = '\0'; /* reset src, href */
        }
        
        static void