utf8.[ch]: imported utf8len() and utf8check() - iomenu - interactive terminal-based selection menu HTML git clone git://bitreich.org/iomenu git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/iomenu DIR Log DIR Files DIR Refs DIR Tags DIR README DIR LICENSE --- DIR commit c383bef4af98a82331c2c6e56ff7964e3dbf690b DIR parent 09d7cdbc37907c01400e2193f4eafba74736aa7d HTML Author: Josuah Demangeon <josuah.demangeon@gandi.net> Date: Tue, 22 Aug 2017 19:48:53 +0200 utf8.[ch]: imported utf8len() and utf8check() Diffstat: M iomenu.c | 37 ++++++++++++++++--------------- D utf.c | 332 ------------------------------- D utf.h | 18 ------------------ A utf8.c | 142 +++++++++++++++++++++++++++++++ A utf8.h | 5 +++++ 5 files changed, 166 insertions(+), 368 deletions(-) --- DIR diff --git a/iomenu.c b/iomenu.c @@ -12,9 +12,9 @@ #define CONTINUE 2 /* as opposed to EXIT_SUCCESS and EXIT_FAILURE */ -#define CONTROL(char) (char ^ 0x40) +#define CTL(char) (char ^ 0x40) #define ALT(char) (char + 0x80) -#define ESC(char) (char + 0x80 + 0x80) +#define CSI(char) (char + 0x80 + 0x80) #define MIN(X, Y) (((X) < (Y)) ? 
(X) : (Y)) static struct winsize ws; @@ -74,7 +74,7 @@ resetterminal(void) int i; /* clear terminal */ - for (i = 0; i < opt['l'] + 1; i++) + for (i = 0; i < rows + 1; i++) fputs("\r\033[K\n", stderr); /* reset cursor position */ @@ -322,61 +322,61 @@ key(void) top: switch (key) { - case CONTROL('C'): + case CTL('C'): return EXIT_FAILURE; - case CONTROL('U'): + case CTL('U'): input[0] = '\0'; filter(); break; - case CONTROL('W'): + case CTL('W'): removeword(); break; case 127: - case CONTROL('H'): /* backspace */ + case CTL('H'): /* backspace */ input[strlen(input) - 1] = '\0'; filter(); break; - case ESC('A'): /* up */ - case CONTROL('P'): + case CSI('A'): /* up */ + case CTL('P'): move(-1); break; - case ESC('B'): /* down */ - case CONTROL('N'): + case CSI('B'): /* down */ + case CTL('N'): move(+1); break; - case ESC('5'): + case CSI('5'): /* page up */ if (fgetc(stdin) != '~') break; /* FALLTHROUGH */ case ALT('v'): movepg(-1); break; - case ESC('6'): + case CSI('6'): /* page down */ if (fgetc(stdin) != '~') break; /* FALLTHROUGH */ - case CONTROL('V'): + case CTL('V'): movepg(+1); break; - case CONTROL('I'): /* tab */ + case CTL('I'): /* tab */ if (linec > 0) strcpy(input, matchv[current]); filter(); break; - case CONTROL('J'): /* enter */ - case CONTROL('M'): + case CTL('J'): /* enter */ + case CTL('M'): printselection(); return EXIT_SUCCESS; case ALT('['): - key = ESC(fgetc(stdin)); + key = CSI(fgetc(stdin)); goto top; case 033: /* escape / alt */ @@ -464,6 +464,7 @@ main(int argc, char *argv[]) input[0] = '\0'; while ((exitcode = key()) == CONTINUE) printscreen(); + printscreen(); resetterminal(); close(ttyfd); DIR diff --git a/utf.c b/utf.c @@ -1,332 +0,0 @@ -/* - * Functions handling UTF-8 strings: - * - * stdin -> buffer -> stdout - * UTF-8 -> rune -> UTF-8 - * char[] -> long[] -> char[] - * - * Thanks to Connor Lane Smith for the idea of combining switches and - * binary masks. 
- */ - - -#include <stdio.h> -#include <string.h> -#include <stdlib.h> - -#include "utf.h" - - -/* --- lengths -------------------------------------------------------------- */ - - -/* - * Return the number of bytes in rune for the `n` next char in `s`, - * or 0 if ti is misencoded. - */ -int -utflen(char *s, int n) -{ - int len = 1; - int continuation_bytes = - (s[0] & 0x80) == 0x00 ? 0 : /* 0xxxxxxx */ - (s[0] & 0xc0) == 0x80 ? 1 : /* 10xxxxxx */ - (s[0] & 0xe0) == 0xc0 ? 2 : /* 110xxxxx */ - (s[0] & 0xf0) == 0xe0 ? 3 : /* 1110xxxx */ - (s[0] & 0xf8) == 0xf0 ? 4 : /* 11110xxx */ - (s[0] & 0xfc) == 0xf8 ? 5 : /* 111110xx */ - (s[0] & 0xfe) == 0xfc ? 6 : /* 1111110x */ - (s[0] & 0xff) == 0xfe ? 7 : /* 11111110 */ - 8; /* 11111111 */ - - if (continuation_bytes > 6 || continuation_bytes > n) - return 0; - - /* check if continuation bytes are 10xxxxxx and increment `len` */ - switch (continuation_bytes) { /* FALLTHROUGH */ - case 7: if ((s[6] & 0xc0) != 0x80) return 0; else len++; - case 6: if ((s[5] & 0xc0) != 0x80) return 0; else len++; - case 5: if ((s[4] & 0xc0) != 0x80) return 0; else len++; - case 4: if ((s[3] & 0xc0) != 0x80) return 0; else len++; - case 3: if ((s[2] & 0xc0) != 0x80) return 0; else len++; - case 2: if ((s[1] & 0xc0) != 0x80) return 0; else len++; - case 0: return len; - default: return 0; - } -} - - -/* - * Return the number of bytes required to encode `rune` into UTF-8, or - * 0 if rune is too long. - */ -int -runelen(long r) -{ - if (r <= 0x0000007f) return 1; - if (r <= 0x000007ff) return 2; - if (r <= 0x0000ffff) return 3; - if (r <= 0x001fffff) return 4; - if (r <= 0x03ffffff) return 5; - if (r <= 0x7fffffff) return 6; - return 0; -} - - -/* --- conversions ---------------------------------------------------------- */ - - -/* - * Sets `r` to a rune corresponding to the firsts `n` bytes of `s`. - * If `s` is misencoded, the rune is stored as a negative value. - * - * Return the number of bytes read. 
- */ -int -utftorune(long *r, char *s, int n) -{ - int len = utflen(s, n), i; - - /* first byte */ - switch (len) { - case 1: *r = s[0]; return 1; /* 0xxxxxxx */ - case 2: *r = s[0] & 0x1f; break; /* 110xxxxx */ - case 3: *r = s[0] & 0x0f; break; /* 1110xxxx */ - case 4: *r = s[0] & 0x07; break; /* 11110xxx */ - case 5: *r = s[0] & 0x03; break; /* 111110xx */ - case 6: *r = s[0] & 0x01; break; /* 1111110x */ - default: *r = -(unsigned char) s[0]; return 1; /* misencoded */ - } - - /* continuation bytes */ - for (i = 1; i < len; i++) - *r = (*r << 6) | (s[i] & 0x3f); /* 10xxxxxx */ - - /* overlong sequences */ - if (runelen(*r) != len) { - *r = -(unsigned char) s[0]; - return 1; - } - - return len; -} - - -/* - * Convert the utf char sring `src` of size `n` to a long string - * `dest`. - * - * Return the length of `i`. - */ -size_t -utftorunes(long *runes, char *utf, size_t n) -{ - size_t i, j; - - for (i = 0, j = 0; n > 0; i++) - j += utftorune(runes + i, utf + j, n - j); - - runes[i] = '\0'; - return i; -} - - -/* - * Encode the rune `r` in utf-8 in `s`, null-terminated. - * - * Return the number of bytes written, 0 if `r` is invalid. 
- */ -int -runetoutf(char *s, long r) -{ - switch (runelen(r)) { - case 1: - s[0] = r; /* 0xxxxxxx */ - s[1] = '\0'; - return 1; - case 2: - s[0] = 0xc0 | (0x1f & (r >> 6)); /* 110xxxxx */ - s[1] = 0x80 | (0x3f & (r)); /* 10xxxxxx */ - s[2] = '\0'; - return 2; - case 3: - s[0] = 0xe0 | (0x0f & (r >> 12)); /* 1110xxxx */ - s[1] = 0x80 | (0x3f & (r >> 6)); /* 10xxxxxx */ - s[2] = 0x80 | (0x3f & (r)); /* 10xxxxxx */ - s[3] = '\0'; - return 3; - case 4: - s[0] = 0xf0 | (0x07 & (r >> 18)); /* 11110xxx */ - s[1] = 0x80 | (0x3f & (r >> 12)); /* 10xxxxxx */ - s[2] = 0x80 | (0x3f & (r >> 6)); /* 10xxxxxx */ - s[3] = 0x80 | (0x3f & (r)); /* 10xxxxxx */ - s[4] = '\0'; - return 4; - case 5: - s[0] = 0xf8 | (0x03 & (r >> 24)); /* 111110xx */ - s[1] = 0x80 | (0x3f & (r >> 18)); /* 10xxxxxx */ - s[2] = 0x80 | (0x3f & (r >> 12)); /* 10xxxxxx */ - s[3] = 0x80 | (0x3f & (r >> 6)); /* 10xxxxxx */ - s[4] = 0x80 | (0x3f & (r)); /* 10xxxxxx */ - s[5] = '\0'; - return 5; - case 6: - s[0] = 0xfc | (0x01 & (r >> 30)); /* 1111110x */ - s[1] = 0x80 | (0x3f & (r >> 24)); /* 10xxxxxx */ - s[2] = 0x80 | (0x3f & (r >> 18)); /* 10xxxxxx */ - s[3] = 0x80 | (0x3f & (r >> 12)); /* 10xxxxxx */ - s[4] = 0x80 | (0x3f & (r >> 6)); /* 10xxxxxx */ - s[5] = 0x80 | (0x3f & (r)); /* 10xxxxxx */ - s[6] = '\0'; - return 6; - default: - s[0] = '\0'; - return 0; - } -} - - -/* - * Fill `s` with a printable representation of `r`. - * - * Return the width of the character. - */ -int -runetoprint(char *s, long r) -{ - if (r < 0) { - return sprintf(s, "[%02x]", (unsigned char) -r); - - } else if (r == 0x7f || r < ' ') { - return sprintf(s, "[%02lx]", r); - - } else if (!runeisprint(r)) { - return sprintf(s, "[%04lx]", r); - - } else { - runetoutf(s, r); - return 1; - } - - return 0; -} - - -/* --- standard library ----------------------------------------------------- */ - - -/* - * Returns 1 if the rune is a printable character and 0 if not. 
- */ -int -runeisprint(long r) -{ - return !( - (r != '\t' && r < ' ') || /* ascii control */ - (r == 0x7f) || - - (0x80 <= r && r < 0xa0) || /* unicode control */ - - (r > 0x10ffff) || /* outside range */ - - ((r & 0x00fffe) == 0x00fffe) || /* noncharacters */ - (0x00fdd0 <= r && r <= 0x00fdef) || - - (0x00e000 <= r && r <= 0x00f8ff) || /* private use */ - (0x0f0000 <= r && r <= 0x0ffffd) || - (0x100000 <= r && r <= 0x10fffd) || - - (0x00d800 <= r && r <= 0x00dfff) /* surrogates */ - ); -} - - -/* - * Read an utf string from `f` up to the first '\n' character or the - * end of the file. It is stored as a rune array into the newly - * allocated `r`. - * - * Return the length of `r`, or -1 if malloc fails or if the end of - * `f` is reached. - */ -size_t -getrunes(long **r, FILE *f) -{ - size_t slen, rlen = 0, size = BUFSIZ, i; - int c; - char *s; - - if (!(s = malloc(size))) return -1; - for (slen = 0; (c = fgetc(f)) != EOF && (c != '\n'); slen++) { - if (slen > size && !(s = realloc(s, ++size))) return -1; - s[slen] = c; - } - - if (!(*r = malloc(size * sizeof (long)))) return -1; - for (i = 0; i < slen; rlen++) - i += utftorune(*r + rlen, s + i, slen - i); - (*r)[rlen] = '\0'; - - free(s); - if (feof(f)) return -1; else return rlen; -} - - -long * -runescpy(long *dest, long *src) -{ - size_t i; - - for (i = 0; src[i] != '\0'; i++) - dest[i] = src[i]; - dest[i] = '\0'; - - return dest; -} - - -long * -runeschr(long *s, long r) -{ - size_t i; - - for (i = 0; s[i] != '\0'; i++) - if (s[i] == r) return s + i; - - return NULL; -} - - -long * -runescat(long *s1, long *s2) -{ - size_t i, j; - - for (i = 0; s1[i] != '\0'; i++); - for (j = 0; s2[j] != '\0'; j++) - s1[i + j] = s2[j]; - s1[i + j] = '\0'; - - return s1; -} - - -int -main() -{ - char s[BUFSIZ]; - long *r; - int len, i; - - for (len = 0; (len = getrunes(&r, stdin)) >= 0 && !feof(stdin); free(r)) { - for (i = 0; i < len; i++) { - runetoprint(s, r[i]); - fputs(s, stdout); - } - - putchar('\n'); - } - free(r); - 
- return 0; -} DIR diff --git a/utf.h b/utf.h @@ -1,18 +0,0 @@ -/* lengths */ -int utflen(char *, int); -int runelen(long); - -/* conversions */ -int utftorune(long *, char *, int); -int utftorune(long *, char *, int); -int runetoutf(char *, long); -int runetoprint(char *, long); - - -/* standard library */ - -int runeisprint(long); -size_t getrunes(long **, FILE *); -long * runescpy(long *, long *); -long * runeschr(long *, long); -long * runescat(long *, long *); DIR diff --git a/utf8.c b/utf8.c @@ -0,0 +1,142 @@ +/* + * ASCII characters all have a leading '0' bit: + * + * 0xxxxxxx + * + * UTF-8(7) lead bytes have one leading '1' and as many following '1' as there are + * continuation bytes (which start with '1' and '0'). + * + * 0xxxxxxx + * 110xxxxx 10xxxxxx + * 1110xxxx 10xxxxxx 10xxxxxx + * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * + * Unicode code points need at most 3 continuation bytes -- at most 4 bytes per rune. + * + * The whole character value is retrieved from the 'x' bits and stored into a + * (long)[]. + * + * Thanks to Connor Lane Smith for the idea of combining switches and + * binary masks. + */ + +#include <stdlib.h> +#include <stdio.h> + +#include "utf8.h" + + +/* + * Return the number of bytes in rune for the `n` next char in `s`, + * or 0 if it is misencoded. + */ +size_t +utf8len(char *s, int n) +{ + unsigned char *sp = (unsigned char *) s; + int i, len = (*sp < 0x80) ? 1 : /* 0xxxxxxx < 10000000 */ + (*sp < 0xc0) ? 0 : /* 10xxxxxx < 11000000 */ + (*sp < 0xe0) ? 2 : /* 110xxxxx < 11100000 */ + (*sp < 0xf0) ? 3 : /* 1110xxxx < 11110000 */ + (*sp < 0xf8) ? 4 : /* 11110xxx < 11111000 */ + (*sp < 0xfc) ? 5 : /* 111110xx < 11111100 */ + (*sp < 0xfe) ? 6 : /* 1111110x < 11111110 */ + (*sp < 0xff) ? 
7 : /* 11111110 < 11111111 */ + 0; + if (len > n) return 0; + + /* check continuation bytes */ + for (sp++, i = 1; i < len; i++, sp++) + if ((*sp & 0xc0) != 0x80) /* 10xxxxxx & 11000000 */ + return 0; + + return len; +} + + +/* + * Return the number of bytes required to encode `rune` into UTF-8, or + * 0 if rune is too long. + */ +size_t +utf8runelen(long r) +{ + return (r <= 0x0000007f) ? 1 : (r <= 0x000007ff) ? 2 : + (r <= 0x0000ffff) ? 3 : (r <= 0x001fffff) ? 4 : + (r <= 0x03ffffff) ? 5 : (r <= 0x7fffffff) ? 6 : 0; +} + + +/* + * Sets 'r' to a rune corresponding to the first 'n' bytes of 's'. + * + * Return the number of bytes read or 0 if the string is misencoded. + */ +size_t +utf8torune(long *r, char *s, size_t n) +{ + char mask[] = { 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; + size_t i, len = utf8len(s, n); + + if (len == 0 || len > 6 || len > n) + return 0; + + /* first byte */ + *r = *s++ & mask[len - 1]; + + /* continuation bytes */ + for (i = 1; i < len; i++) + *r = (*r << 6) | (*s++ & 0x3f); /* 10xxxxxx */ + + /* overlong sequences */ + if (utf8runelen(*r) != len) + return 0; + + return len; +} + + +/* + * Returns 1 if the rune is an accepted Unicode code point (in range and not a noncharacter, private use or surrogate) and 0 if not. + */ +int +utf8runeisunicode(long r) +{ + return !( + (r > 0x10ffff) || /* outside range */ + + ((r & 0x00fffe) == 0x00fffe) || /* noncharacters */ + (0x00fdd0 <= r && r <= 0x00fdef) || + + (0x00e000 <= r && r <= 0x00f8ff) || /* private use */ + (0x0f0000 <= r && r <= 0x0ffffd) || + (0x100000 <= r && r <= 0x10fffd) || + + (0x00d800 <= r && r <= 0x00dfff) /* surrogates */ + ); +} + + +/* + * Return 1 if '*s' is correctly encoded in UTF-8 with allowed Unicode + * code points. 
+ */ +int +utf8check(char *s, size_t len) +{ + size_t shift; + long r = 0; + + while (len > 0) { + shift = utf8torune(&r, s, len); + if (!shift || !utf8runeisunicode(r)) + return 0; + + s += shift; + len -= shift; + } + + return 1; +} DIR diff --git a/utf8.h b/utf8.h @@ -0,0 +1,5 @@ +size_t utf8len(char *, int); +size_t utf8runelen(long); +size_t utf8torune(long *, char *, size_t); +int utf8runeisunicode(long); +int utf8check(char *, size_t);