tadd initial "optional" tag handling, rework some tag handling - webdump - [FORK] git://git.codemadness.org/webdump
HTML git clone git://git.z3bra.org/webdump.git
DIR Log
DIR Files
DIR Refs
DIR README
DIR LICENSE
---
DIR commit 1c95e7d86a0dc62670a87f755b3507ceab912ec1
DIR parent ff081bef0cfc66b4a5960996a0fc41dea868ac60
HTML Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Wed, 11 Mar 2020 15:44:22 +0100
add initial "optional" tag handling, rework some tag handling
- add initial support for "optional" closing tags, which are used in HTML, not
XHTML.
- align the tags table.
- rework some end tag handling.
Diffstat:
M webdump.c | 280 +++++++++++++++++++------------
1 file changed, 177 insertions(+), 103 deletions(-)
---
DIR diff --git a/webdump.c b/webdump.c
t@@ -50,11 +50,11 @@ static int linkcount;
enum DisplayType {
DisplayUnknown = 0,
- DisplayNone = 1 << 0,
- DisplayPre = 1 << 1,
- DisplayInline = 1 << 2,
- DisplayInlineBlock = 1 << 3,
- DisplayBlock = 1 << 4,
+ DisplayInline = 1 << 0,
+ DisplayInlineBlock = 1 << 1,
+ DisplayBlock = 1 << 2,
+ DisplayNone = 1 << 3,
+ DisplayPre = 1 << 4,
DisplayList = 1 << 5,
DisplayListOrdered = 1 << 6,
DisplayListItem = 1 << 7,
t@@ -75,6 +75,7 @@ struct tag {
enum DisplayType displaytype;
enum DisplayType parenttype; /* display type belonging to element */
int isvoid; /* "void" element */
+ int isoptional; /* optional to close tag */
};
struct node {
t@@ -112,60 +113,84 @@ static size_t ncharsline = 0;
static struct node nodes[MAX_DEPTH];
static int curnode;
+#if 0
+/* TODO: optional tags */
+{ "body", 0, 0, 0, 1 },
+{ "colgroup", 0, 0, 0, 1 },
+{ "dd", 0, 0, 0, 1 },
+{ "dt", 0, 0, 0, 1 },
+{ "head", 0, 0, 0, 1 },
+{ "html", 0, 0, 0, 1 },
+{ "li", 0, 0, 0, 1 },
+{ "optgroup", 0, 0, 0, 1 },
+{ "option", 0, 0, 0, 1 },
+{ "option", 0, 0, 0, 1 },
+{ "p", 0, 0, 0, 1 },
+{ "rp", 0, 0, 0, 1 },
+{ "rt", 0, 0, 0, 1 },
+{ "tbody", 0, 0, 0, 1 },
+{ "td", 0, 0, 0, 1 },
+{ "tfoot", 0, 0, 0, 1 },
+{ "th", 0, 0, 0, 1 },
+{ "thead", 0, 0, 0, 1 },
+{ "tr", 0, 0, 0, 1 },
+#endif
+
+/* tag displaytype p v o */
static struct tag tags[] = {
-{ "a", DisplayInline | DisplayUnderline, 0, 0 },
-{ "area", DisplayInline, 0, 1 },
-{ "article", DisplayBlock },
-{ "audio", DisplayInline | DisplayUnderline, 0, 0 },
-{ "b", DisplayInline | DisplayBold },
-{ "base", DisplayInline, 0, 1 },
-{ "blink", DisplayInline | DisplayBlink },
-{ "blockquote", DisplayBlock },
-{ "br", 0, 0, 1 },
-{ "code", DisplayPre },
-{ "col", DisplayInline, 0, 1 },
-{ "del", DisplayInline | DisplayStrike },
-{ "div", DisplayBlock },
-{ "em", DisplayInline | DisplayItalic },
-{ "embed", DisplayInline, 0, 1 },
-{ "footer", DisplayBlock },
-{ "h1", DisplayHeader | DisplayBold },
-{ "h2", DisplayHeader | DisplayBold },
-{ "h3", DisplayHeader | DisplayBold },
-{ "h4", DisplayHeader | DisplayBold },
-{ "h5", DisplayHeader | DisplayBold },
-{ "h6", DisplayHeader | DisplayBold },
-{ "header", DisplayBlock },
-{ "hr", DisplayBlock, 0, 1 },
-{ "i", DisplayInline | DisplayItalic },
-{ "img", DisplayInline | DisplayUnderline, 0, 1 },
-{ "input", DisplayInline, 0, 1 },
-{ "li", DisplayListItem, DisplayList },
-{ "link", DisplayInline, 0, 1 },
-{ "main", DisplayBlock },
-{ "meta", DisplayInline, 0, 1 },
-{ "nav", DisplayBlock },
-{ "ol", DisplayList | DisplayListOrdered },
-{ "p", DisplayBlock },
-{ "param", DisplayInline, 0, 1 },
-{ "pre", DisplayPre },
-{ "s", DisplayInline | DisplayStrike },
-{ "script", DisplayNone },
-{ "source", DisplayInline, 0, 1 },
-{ "strike", DisplayInline | DisplayStrike },
-{ "strong", DisplayInline | DisplayBold },
-{ "style", DisplayNone },
-{ "table", DisplayTable },
-{ "td", DisplayTableCell, DisplayTableRow },
-{ "template", DisplayNone },
-{ "th", DisplayTableCell | DisplayBold, DisplayTableRow },
-{ "title", DisplayBlock },
-{ "tr", DisplayTableRow, DisplayTable },
-{ "track", DisplayInline, 0, 1 },
-{ "u", DisplayInline | DisplayUnderline },
-{ "ul", DisplayList },
-{ "video", DisplayInline | DisplayUnderline, 0, 0 },
-{ "wbr", DisplayInline, 0, 1 },
+{ "a", DisplayInline | DisplayUnderline, 0, 0, 0 },
+{ "area", DisplayInline, 0, 1, 0 },
+{ "article", DisplayBlock, 0, 0, 0 },
+{ "audio", DisplayInline | DisplayUnderline, 0, 0, 0 },
+{ "b", DisplayInline | DisplayBold, 0, 0, 0 },
+{ "base", DisplayInline, 0, 1, 0 },
+{ "blink", DisplayInline | DisplayBlink, 0, 0, 0 },
+{ "blockquote", DisplayBlock, 0, 0, 0 },
+{ "br", 0, 0, 1, 0 },
+{ "code", DisplayPre, 0, 0, 0 },
+{ "col", DisplayInline, 0, 1, 0 },
+{ "del", DisplayInline | DisplayStrike, 0, 0, 0 },
+{ "div", DisplayBlock, 0, 0, 0 },
+{ "em", DisplayInline | DisplayItalic, 0, 0, 0 },
+{ "embed", DisplayInline, 0, 1, 0 },
+{ "footer", DisplayBlock, 0, 0, 0 },
+{ "h1", DisplayHeader | DisplayBold, 0, 0, 0 },
+{ "h2", DisplayHeader | DisplayBold, 0, 0, 0 },
+{ "h3", DisplayHeader | DisplayBold, 0, 0, 0 },
+{ "h4", DisplayHeader | DisplayBold, 0, 0, 0 },
+{ "h5", DisplayHeader | DisplayBold, 0, 0, 0 },
+{ "h6", DisplayHeader | DisplayBold, 0, 0, 0 },
+{ "header", DisplayBlock, 0, 0, 0 },
+{ "hr", DisplayBlock, 0, 1, 0 },
+{ "i", DisplayInline | DisplayItalic, 0, 0, 0 },
+{ "img", DisplayInline | DisplayUnderline, 0, 1, 0 },
+{ "input", DisplayInline, 0, 1, 0 },
+{ "li", DisplayListItem, DisplayList, 0, 1 },
+{ "link", DisplayInline, 0, 1, 0 },
+{ "main", DisplayBlock, 0, 0, 0 },
+{ "meta", DisplayInline, 0, 1, 0 },
+{ "nav", DisplayBlock, 0, 0, 0 },
+{ "ol", DisplayList | DisplayListOrdered, 0, 0, 0 },
+{ "p", DisplayBlock, 0, 0, 1 },
+{ "param", DisplayInline, 0, 1, 0 },
+{ "pre", DisplayPre, 0, 0, 0 },
+{ "s", DisplayInline | DisplayStrike, 0, 0, 0 },
+{ "script", DisplayNone, 0, 0, 0 },
+{ "source", DisplayInline, 0, 1, 0 },
+{ "strike", DisplayInline | DisplayStrike, 0, 0, 0 },
+{ "strong", DisplayInline | DisplayBold, 0, 0, 0 },
+{ "style", DisplayNone, 0, 0, 0 },
+{ "table", DisplayTable, 0, 0, 0 },
+{ "td", DisplayTableCell, DisplayTableRow, 0, 0 },
+{ "template", DisplayNone, 0, 0, 0 },
+{ "th", DisplayTableCell | DisplayBold, DisplayTableRow, 0, 1 },
+{ "title", DisplayBlock, 0, 0, 0 },
+{ "tr", DisplayTableRow, DisplayTable, 0, 1 },
+{ "track", DisplayInline, 0, 1, 0 },
+{ "u", DisplayInline | DisplayUnderline, 0, 0, 0 },
+{ "ul", DisplayList, 0, 0, 0 },
+{ "video", DisplayInline | DisplayUnderline, 0, 0, 0 },
+{ "wbr", DisplayInline, 0, 1, 0 },
};
static const char *ignorestate, *endtag;
t@@ -560,7 +585,6 @@ xmldataentity(XMLParser *p, const char *data, size_t datalen)
if (cur->tag.displaytype & DisplayNone)
return;
- /* convert basic XML entities */
n = xml_entitytostr(data, buf, sizeof(buf));
if (n > 0)
xmldata(p, buf, (size_t)n);
t@@ -588,50 +612,12 @@ findtag(const char *t)
}
static void
-xmltagstart(XMLParser *x, const char *t, size_t tl)
+tagend(struct node *cur)
{
- struct tag *found;
- struct node *cur, *parent;
-
- if (curnode >= MAX_DEPTH - 2)
- errx(1, "max tag depth reached: %d\n", curnode);
- parent = &nodes[curnode];
- curnode++;
-
- cur = &nodes[curnode];
- memset(cur, 0, sizeof(*cur));
- /* tag defaults */
- cur->tag.displaytype = DisplayInline;
- cur->tag.name = cur->tagname;
- strlcpy(cur->tagname, t, sizeof(cur->tagname));
-
- /* match tag */
- if ((found = findtag(t))) {
- cur->nchildren = 0;
- memcpy(&(cur->tag), found, sizeof(*found));
- /* parent tag is hidden, so hide ourself too */
- if (parent->tag.displaytype & DisplayNone)
- cur->tag.displaytype |= DisplayNone;
- return;
- }
-
- src[0] = '\0'; /* reset src, href */
-}
-
-static void
-xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
-{
- struct tag *found;
- struct node *cur;
- int i;
-
- /* ignore closing of void elements, like </br>, which is not allowed */
- if ((found = findtag(t))) {
- if (!isshort && found->isvoid)
- return;
- }
+ const char *t;
+ size_t i;
- cur = &nodes[curnode];
+ t = cur->tag.name;
if (cur->tag.displaytype & DisplayBold)
printansi("\033[22m"); /* reset bold or faint */
t@@ -659,7 +645,7 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
} else if (cur->tag.displaytype & DisplayHeader) {
newline();
#if 1
- if (tl == 2 && t[0] == 'h' && t[1] >= '1' && t[1] <= '6') {
+ if (t[0] == 'h' && t[1] >= '1' && t[1] <= '6' && t[2] == '\0') {
if (t[1] >= '3')
for (i = 0; i < termwidth; i++)
putchar('-');
t@@ -671,7 +657,95 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
#endif
}
- curnode--;
+}
+
+static void
+xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
+{
+ struct tag *found;
+ int i;
+
+ /* ignore closing of void elements, like </br>, which is not allowed */
+ if ((found = findtag(t))) {
+ if (!isshort && found->isvoid)
+ return;
+ }
+
+ /* if the current closing tag matches the current open tag */
+ if (!strcasecmp(nodes[curnode].tag.name, t)) {
+ tagend(&nodes[curnode]);
+ if (curnode)
+ curnode--;
+ } else {
+ /* ... else lookup the first matching start tag. This is also
+ for handling optional closing tags */
+ for (i = curnode; i > 0; i--) {
+ if (!strcasecmp(nodes[i].tag.name, t)) {
+ tagend(&nodes[i]);
+ curnode = i;
+ break;
+ }
+ }
+ if (curnode)
+ curnode--;
+ }
+}
+
+/* check if the specified tag is closed at some point in the current tree */
+static int
+istagclosed(int cur)
+{
+ int i;
+
+ if (!cur)
+ return 0;
+ for (i = cur - 1; i > 0; i--) {
+ if (!strcasecmp(nodes[i].tag.name, nodes[cur].tag.name))
+ return 0;
+ }
+ return 1;
+}
+
+static void
+xmltagstart(XMLParser *x, const char *t, size_t tl)
+{
+ struct tag *found;
+ struct node *cur, *parent;
+ char *s;
+
+ if (curnode >= MAX_DEPTH - 2)
+ errx(1, "max tag depth reached: %d\n", curnode);
+ parent = &nodes[curnode];
+ curnode++;
+
+ cur = &nodes[curnode];
+ memset(cur, 0, sizeof(*cur));
+ /* tag defaults */
+ cur->tag.displaytype = DisplayInline;
+ cur->tag.name = cur->tagname;
+ strlcpy(cur->tagname, t, sizeof(cur->tagname));
+ /* to lowercase */
+ for (s = cur->tagname; *s; s++)
+ *s = tolower((unsigned char)*s);
+
+ /* match tag */
+ if ((found = findtag(t))) {
+ cur->nchildren = 0;
+ memcpy(&(cur->tag), found, sizeof(*found));
+
+ if (cur->tag.isoptional && curnode && !istagclosed(curnode)) {
+ /* if it's an unclosed tag and it has parent (like ol, ul)
+ then fake the end tag. */
+ tagend(&nodes[curnode]);
+ }
+
+ /* parent tag is hidden, so hide ourself too */
+ if (parent->tag.displaytype & DisplayNone)
+ cur->tag.displaytype |= DisplayNone;
+ return;
+ }
+
+ src[0] = '\0'; /* reset src, href */
}
static void