utf8pad: improve padded printing and printing invalid unicode characters - stagit-gopher - A git gopher frontend. (mirror) HTML git clone git://bitreich.org/stagit-gopher/ git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/stagit-gopher/ DIR Log DIR Files DIR Refs DIR Tags DIR README DIR LICENSE --- DIR commit 554a9fe2e9d12defd9d6253871d8261d3f3ef3c6 DIR parent 7b93d02cd8f26ab9a25d967c72c359a22c91eb74 HTML Author: Hiltjo Posthuma <hiltjo@codemadness.org> Date: Sat, 9 Jan 2021 14:56:51 +0100 utf8pad: improve padded printing and printing invalid unicode characters - Use unicode replacement character (codepoint 0xfffd) when a codepoint is invalid and proceed printing the rest of the characters. - When a codepoint is invalid reset the internal state of mbtowc(3), from the OpenBSD man page: " If a call to mbtowc() resulted in an undefined internal state, mbtowc() must be called with s set to NULL to reset the internal state before it can safely be used again." - Make the function return 0 when `len` is 0 (this should not be an error). Diffstat: M stagit-gopher-index.c | 59 ++++++++++++++++++++++--------- M stagit-gopher.c | 58 ++++++++++++++++++++++--------- 2 files changed, 83 insertions(+), 34 deletions(-) --- DIR diff --git a/stagit-gopher-index.c b/stagit-gopher-index.c @@ -10,6 +10,9 @@ #include <git2.h> +#define PAD_TRUNCATE_SYMBOL "\xe2\x80\xa6" /* symbol: "ellipsis" */ +#define UTF_INVALID_SYMBOL "\xef\xbf\xbd" /* symbol: "replacement" */ + static git_repository *repo; static const char *relpath = ""; @@ -17,40 +20,62 @@ static const char *relpath = ""; static char description[255] = "Repositories"; static char *name = ""; -/* format `len' columns of characters. If string is shorter pad the rest +/* Format `len' columns of characters. If string is shorter pad the rest * with characters `pad`. 
*/ int utf8pad(char *buf, size_t bufsiz, const char *s, size_t len, int pad) { wchar_t wc; size_t col = 0, i, slen, siz = 0; - int rl, w; + int inc, rl, w; - if (!len) + if (!bufsiz) return -1; + if (!len) { + buf[0] = '\0'; + return 0; + } slen = strlen(s); - for (i = 0; i < slen; i += rl) { - if ((rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4)) <= 0) - break; - if ((w = wcwidth(wc)) == -1) + for (i = 0; i < slen; i += inc) { + inc = 1; + if ((unsigned char)s[i] < 32) continue; - if (col + w > len || (col + w == len && s[i + rl])) { + + rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4); + if (rl < 0) { + mbtowc(NULL, NULL, 0); /* reset state */ + inc = 1; /* next byte */ + w = 1; /* replacement char is one width */ + } else if ((w = wcwidth(wc)) == -1) { + continue; + } else { + inc = rl; + } + + if (col + w > len || (col + w == len && s[i + inc])) { if (siz + 4 >= bufsiz) return -1; - memcpy(&buf[siz], "\xe2\x80\xa6", 3); - siz += 3; - if (col + w == len && w > 1) - buf[siz++] = pad; + memcpy(&buf[siz], PAD_TRUNCATE_SYMBOL, sizeof(PAD_TRUNCATE_SYMBOL) - 1); + siz += sizeof(PAD_TRUNCATE_SYMBOL) - 1; buf[siz] = '\0'; - return 0; + col++; + break; + } else if (rl < 0) { + if (siz + 4 >= bufsiz) + return -1; + memcpy(&buf[siz], UTF_INVALID_SYMBOL, sizeof(UTF_INVALID_SYMBOL) - 1); + siz += sizeof(UTF_INVALID_SYMBOL) - 1; + buf[siz] = '\0'; + col++; + continue; } - if (siz + rl + 1 >= bufsiz) + if (siz + inc + 1 >= bufsiz) return -1; - memcpy(&buf[siz], &s[i], rl); - col += w; - siz += rl; + memcpy(&buf[siz], &s[i], inc); + siz += inc; buf[siz] = '\0'; + col += w; } len -= col; DIR diff --git a/stagit-gopher.c b/stagit-gopher.c @@ -19,6 +19,8 @@ #include "compat.h" #define LEN(s) (sizeof(s)/sizeof(*s)) +#define PAD_TRUNCATE_SYMBOL "\xe2\x80\xa6" /* symbol: "ellipsis" */ +#define UTF_INVALID_SYMBOL "\xef\xbf\xbd" /* symbol: "replacement" */ struct deltainfo { git_patch *patch; @@ -80,40 +82,62 @@ static char lastoidstr[GIT_OID_HEXSZ + 2]; /* id + newline + NUL 
byte */ static FILE *rcachefp, *wcachefp; static const char *cachefile; -/* format `len' columns of characters. If string is shorter pad the rest +/* Format `len' columns of characters. If string is shorter pad the rest * with characters `pad`. */ int utf8pad(char *buf, size_t bufsiz, const char *s, size_t len, int pad) { wchar_t wc; size_t col = 0, i, slen, siz = 0; - int rl, w; + int inc, rl, w; - if (!len) + if (!bufsiz) return -1; + if (!len) { + buf[0] = '\0'; + return 0; + } slen = strlen(s); - for (i = 0; i < slen; i += rl) { - if ((rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4)) <= 0) - break; - if ((w = wcwidth(wc)) == -1) + for (i = 0; i < slen; i += inc) { + inc = 1; + if ((unsigned char)s[i] < 32) + continue; + + rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4); + if (rl < 0) { + mbtowc(NULL, NULL, 0); /* reset state */ + inc = 1; /* next byte */ + w = 1; /* replacement char is one width */ + } else if ((w = wcwidth(wc)) == -1) { continue; - if (col + w > len || (col + w == len && s[i + rl])) { + } else { + inc = rl; + } + + if (col + w > len || (col + w == len && s[i + inc])) { if (siz + 4 >= bufsiz) return -1; - memcpy(&buf[siz], "\xe2\x80\xa6", 3); - siz += 3; - if (col + w == len && w > 1) - buf[siz++] = pad; + memcpy(&buf[siz], PAD_TRUNCATE_SYMBOL, sizeof(PAD_TRUNCATE_SYMBOL) - 1); + siz += sizeof(PAD_TRUNCATE_SYMBOL) - 1; buf[siz] = '\0'; - return 0; + col++; + break; + } else if (rl < 0) { + if (siz + 4 >= bufsiz) + return -1; + memcpy(&buf[siz], UTF_INVALID_SYMBOL, sizeof(UTF_INVALID_SYMBOL) - 1); + siz += sizeof(UTF_INVALID_SYMBOL) - 1; + buf[siz] = '\0'; + col++; + continue; } - if (siz + rl + 1 >= bufsiz) + if (siz + inc + 1 >= bufsiz) return -1; - memcpy(&buf[siz], &s[i], rl); - col += w; - siz += rl; + memcpy(&buf[siz], &s[i], inc); + siz += inc; buf[siz] = '\0'; + col += w; } len -= col;