twip - webdump - [FORK] git://git.codemadness.org/webdump
HTML git clone git://git.z3bra.org/webdump.git
DIR Log
DIR Files
DIR Refs
DIR README
DIR LICENSE
---
DIR commit 114efd43e79a417abbda2e8c427d9dd57b482bce
DIR parent ea14e82082be78917aaa6e380879c0e230330b47
HTML Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Mon, 24 Jul 2017 10:06:48 +0200
wip
Diffstat:
A TODO | 1 +
M main.c | 110 +++++++++++++++++++++++++++----
2 files changed, 97 insertions(+), 14 deletions(-)
---
DIR diff --git a/TODO b/TODO
t@@ -0,0 +1 @@
+? xml.c: make sure to always call xmldata handler even if datalen == 0 ?
DIR diff --git a/main.c b/main.c
t@@ -1,3 +1,6 @@
+/* TODO: escape control characters */
+/* TODO: specify and parse relative url */
+
#include <ctype.h>
#include <err.h>
#include <stdint.h>
t@@ -76,9 +79,12 @@ static char *blocktags[] = {
"title",
"tr",
"table",
+ "code",
+ "blockquote",
};
static String htmldata;
+static String preprocess;
/* Clear string only; don't free, prevents unnecessary reallocation. */
static void
t@@ -115,6 +121,28 @@ string_append(String *s, const char *data, size_t len)
s->data[s->len] = '\0';
}
+#if 0
+static void
+safeprint(const char *s, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len && *s; i++) {
+ switch (*s) {
+ case '\t':
+ case '\n':
+ putchar(*s);
+ break;
+ default:
+ if (iscntrl(*s))
+ putchar(' ');
+ else
+ putchar(*s);
+ }
+ }
+}
+#endif
+
static void
xmlcdata(XMLParser *p, const char *data, size_t datalen)
{
t@@ -128,31 +156,41 @@ xmldataend(XMLParser *p)
char *start, *s, *e;
cur = &nodes[curnode];
+ if (!htmldata.data || !htmldata.len)
+ return;
start = htmldata.data;
- for (s = start; *s; s++)
+#if 1
+ s = start;
+ e = s + strlen(s);
+#else
+ for (s = start; *s; s++) {
if (*s != '\r' && *s != '\n')
break;
+ }
- e = s + strlen(s);
- for (; e > s; e--)
+ for (e = s + strlen(s); e > s; e--) {
if (*e != '\r' && *e != '\n')
break;
+ }
+#endif
if (cur->ispre) {
fwrite(s, 1, e - s, stdout);
} else {
+#if 0
for (; s < e; s++) {
- if (!isspace(*s))
- break;
- }
- for (; s < e; s++) {
- if (!isspace(*s)) {
- if (s != start && isspace(s[-1]))
+ if (isspace(*s)) {
+ if (s != start && !isspace(s[-1]))
putchar(' ');
+ } else {
putchar(*s);
}
}
+ if (s != start && e != start && !isspace(s[-1]) && isspace(e[-1]))
+ putchar(' ');
+#endif
+ printf("DEBUG: |%s|\n", start);
}
string_clear(&htmldata);
t@@ -164,10 +202,9 @@ xmldata(XMLParser *p, const char *data, size_t datalen)
struct node *cur;
cur = &nodes[curnode];
+ string_append(&htmldata, data, datalen);
if (cur->isignore)
return;
-
- string_append(&htmldata, data, datalen);
}
static void
t@@ -239,6 +276,9 @@ xmltagend(XMLParser *p, const char *tag, size_t taglen, int isshort)
src[0] = '\0';
#endif
+ if (!strcmp(tag, "tr"))
+ fputs(" | ", stdout); /* HACK */
+
if (cur->isblock)
fputs("\n", stdout);
t@@ -266,6 +306,9 @@ xmltagstartparsed(XMLParser *p, const char *tag, size_t taglen, int isshort)
if (cur->isblock)
fputs("\n", stdout);
+ if (!strcmp(tag, "td"))
+ fputs(" | ", stdout); /* HACK */
+
if (!strcmp(cur->tag, "li")) {
/* indent nested list items */
for (i = curnode; i; i--) {
t@@ -295,18 +338,57 @@ xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name,
if (!strcmp(tag, "a") && !strcmp(name, "href") && valuelen)
strlcpy(src, value, sizeof(src));
- /* TODO: check alt and title attr also? */
- if (!strcmp(tag, "img") && !strcmp(name, "src") && valuelen)
+ if ((!strcmp(tag, "img") || !strcmp(tag, "video") || !strcmp(tag, "audio")) &&
+ !strcmp(name, "src") && valuelen)
strlcpy(src, value, sizeof(src));
}
+static size_t read_offset;
+
+int
+readchar(void)
+{
+ size_t i, j;
+ int c;
+
+ for (; readoffset < preprocess.len; ) {
+ if (preprocess.data[read_offset] != '<')
+ return preprocess.data[read_offset++];
+
+ for (j = 0; j < sizeof(ignoretags) / sizeof(*ignoretags); j++) {
+ if (!strncmp(&preprocess.data[i + 1], ignoretags[i], sizeof(ignoretags[i]) - 1)) {
+ if (strchr(" \t>", preprocess.data[i + 1 + sizeof(ignoretags[i]) - 1])) {
+ /* TODO: search until end of this tag */
+ }
+ }
+ }
+ /* TODO: if no match just return char */
+ return preprocess.data[read_offset++];
+ }
+ return EOF;
+}
+
/* TODO: preprocess data, strip <script>, <style> etc */
int
main(void)
{
+
+ char buf[BUFSIZ];
+ int n;
+
if (pledge("stdio", NULL) < 0)
err(1, "pledge");
+ /* TODO: optimize later */
+ while (1) {
+ /* TODO: check read error */
+ n = read(0, buf, sizeof(buf) - 1);
+ if (n <= 0)
+ break;
+ buf[n] = '\0';
+ string_append(&preprocess, buf, n);
+ }
+
parser.xmlattr = xmlattr;
parser.xmlcdata = xmlcdata;
parser.xmldata = xmldata;
t@@ -316,7 +398,7 @@ main(void)
parser.xmltagend = xmltagend;
parser.xmltagstartparsed = xmltagstartparsed;
- parser.getnext = getchar;
+ parser.getnext = readchar;
xml_parse(&parser);
putchar('\n');