--- unhtml.bak 2024-07-14 17:22:49.000000000 -0700
+++ unhtml.c 2024-07-16 07:02:48.719828785 -0700
@@ -20,8 +20,8 @@
typedef struct {
char in[7];
- char out1d; /* DOS character (USA codepage) */
- char out1w; /* Windows character */
+ unsigned char out1d; /* DOS character (USA codepage) */
+ unsigned char out1w; /* Windows character */
char out2[4]; /* ASCII substitute */
char use2; /* 1- use out2 instead of out1d for dos2flag
2- diacritical marked character
@@ -58,7 +58,8 @@
{"#167", 21, 167, "%"},
{"uml", '"', 168, "\""},
{"#168", '"', 168, "\""},
- {"cright", 'C', 169, "(C)",1},
+ {"COPY", 'C', 169, "(C)",1},
+ {"copy", 'C', 169, "(C)",1},
{"#169", 'C', 169, "(C)",1},
{"ordf", 166, 170, "a"},
{"#170", 166, 170, "a"},
@@ -173,6 +174,8 @@
{{0},0,0,{0}}
};
+/* length of the longest entity name in the table above */
+#define MAX_SUB 6
void newline(void) {
@@ -208,13 +211,30 @@
}
+/*
+ * Read the next input character into ch (declared outside this function).
+ * At end of input this calls cnewline() and exits with status 0.
+ * While quoting is zero, newline, carriage return and tab count as spaces
+ * and a run of them is returned as one ' '; a run that ends at EOF is dropped.
+ */
void mygetchar(void) {
+ int space = 0; /* nonzero once at least one space has been swallowed */
for (;;) {
ch = getchar();
- if (ch == '\n' && !quoting) ch = ' '; /* convert to whitespace */
if (ch == EOF) {
cnewline();
exit(0);
}
+ if (!quoting) {
+ if (ch == '\n' || ch == '\r' || ch == '\t') ch = ' '; /* fold line breaks (LF or CRLF) and tabs into a space */
+ if (ch == ' ') {
+ space = 1; /* swallow it; the whole run is reported as one space below */
+ continue;
+ }
+ if (space) {
+ ungetc(ch, stdin); /* push the non-space back so the next call returns it */
+ ch = ' ';
+ }
+ }
return;
}
}
@@ -253,13 +267,14 @@
void main(int argc, char **argv) {
int notflag=0, intitle=0;
- char cmdbuf[20];
+ #define CMDBUF_SIZE 32
+ char cmdbuf[CMDBUF_SIZE];
int listlevel = -1; /* not in a list */
int listcount[10]; /* current counter value at each list level */
int i;
char *arglist;
- fprintf(stderr, "HTML removing filter Version 1.0\n"
+ fprintf(stderr, "HTML removing filter Version 1.0c\n"
"Copyright 1996 by Tom Almy\n");
if (argc > 2) usage();
@@ -296,30 +311,44 @@
/* special character processing */
mygetchar();
i=0;
- while (ch != ';' && i < 12) {
+ while (ch != ';' && i < CMDBUF_SIZE - 1) {
cmdbuf[i++] = ch;
mygetchar();
}
+ if (intitle) continue;
cmdbuf[i] = 0;
- if (i > 10) {
- /* bad &; field, should not occur, but I've seen them! */
- if (!intitle) {
- printf("&%s%c", cmdbuf, ch);
+ if (*cmdbuf == '#') {
+ /* numeric reference: parse into its own variable so that i keeps
+ the name length needed by the MAX_SUB test below */
+ long code;
+ if (cmdbuf[1] == 'x' || cmdbuf[1] == 'X') {
+ code = strtol(cmdbuf + 2, 0, 16);
+ } else {
+ code = strtol(cmdbuf + 1, 0, 10);
+ }
+ /* plain ASCII is written directly; 0 (which strtol also returns
+ when no digits are present) and negatives are never written */
+ if (code > 0 && code < 128) {
+ putchar((int)code);
startline = 0;
+ continue;
}
- continue;
}
- i = 0;
- while (a[i].in) {
- if (strcmp(a[i].in,cmdbuf)==0) {
- if (!intitle) {
+ /* everything else, including "#nnn" for 128 and up, is looked up in the table */
+ if (i <= MAX_SUB) {
+ i = 0;
+ while (*a[i].in) {
+ if (strcmp(a[i].in,cmdbuf)==0) {
putTableChar(i);
- startline = 0;
+ i = 0;
+ break;
}
- break;
+ i++;
}
- i++;
}
+ /* i is nonzero here unless a table entry matched: echo the text as read */
+ if (i) printf("&%s%c", cmdbuf, ch);
+ startline = 0;
continue;
}
/* process <> command */
@@ -330,7 +352,7 @@
mygetchar();
}
i=0;
- while (ch != ' ' && ch != '>') {
+ while (!isspace(ch) && ch != '>' && i < CMDBUF_SIZE - 1) {
cmdbuf[i++] = ch;
mygetchar();
}
@@ -391,7 +413,9 @@
}
if (strcmp("pre", cmdbuf)==0) {
/* preformatted */
- if (!notflag) cnewline();
+ cnewline();
+ newline();
+ if (notflag) skipws = 1;
quoting = !notflag;
continue;
}