utf8.c - ltkx - GUI toolkit for X11 (old)
HTML git clone git://lumidify.org/ltkx.git (fast, but not encrypted)
HTML git clone https://lumidify.org/ltkx.git (encrypted, but very slow)
HTML git clone git://4kcetb7mo7hj6grozzybxtotsub5bempzo4lirzc3437amof2c2impyd.onion/ltkx.git (over tor)
DIR Log
DIR Files
DIR Refs
DIR README
DIR LICENSE
---
utf8.c (18333B)
---
1 /*
2 Basic UTF-8 manipulation routines
3 by Jeff Bezanson
4 placed in the public domain Fall 2005
5
6 This code is designed to provide the utilities you need to manipulate
7 UTF-8 as an internal string encoding. These functions do not perform the
8 error checking normally needed when handling UTF-8 data, so if you happen
9 to be from the Unicode Consortium you will want to flay me alive.
10 I do this because error checking can be performed at the boundaries (I/O),
11 with these routines reserved for higher performance on data known to be
12 valid.
13 A UTF-8 validation routine is included.
14 */
15 #include <stdlib.h>
16 #include <stdio.h>
17 #include <string.h>
18 #include <stdarg.h>
19 #include <stdint.h>
20 #include <wchar.h>
21 #include <wctype.h>
22
23 #ifdef WIN32
24 #include <malloc.h>
25 #define snprintf _snprintf
26 #else
27 #ifndef __FreeBSD__
28 #include <alloca.h>
29 #endif /* __FreeBSD__ */
30 #endif
31 #include <assert.h>
32
33 #include "utf8.h"
34
/* Correction constants, indexed by the number of trailing bytes (0..5).
   The decoders add up the raw bytes of a sequence (shifting left by 6
   between bytes) and then subtract the entry for that sequence length,
   which cancels the marker bits of the lead byte and of every
   continuation byte, e.g. entry 1 is (0xC0 << 6) + 0x80. */
static const uint32_t offsetsFromUTF8[6] = {
    0x00000000UL,   /* 0 trailing bytes */
    0x00003080UL,   /* 1 trailing byte  */
    0x000E2080UL,   /* 2 trailing bytes */
    0x03C82080UL,   /* 3 trailing bytes */
    0xFA082080UL,   /* 4 trailing bytes */
    0x82082080UL    /* 5 trailing bytes */
};
39
/* Number of bytes that follow a given first byte of a sequence, indexed
   by that byte's value. One row per 16 byte values. */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
50
51 /* returns length of next utf-8 sequence */
52 size_t u8_seqlen(const char *s)
53 {
54 return trailingBytesForUTF8[(unsigned int)(unsigned char)s[0]] + 1;
55 }
56
/* Number of bytes needed to encode code point ch in UTF-8.
   Returns 0 for values of 0x110000 and above, which are not encoded. */
size_t u8_charlen(uint32_t ch)
{
    if (ch >= 0x110000)
        return 0;
    if (ch >= 0x10000)
        return 4;
    if (ch >= 0x800)
        return 3;
    return (ch >= 0x80) ? 2 : 1;
}
71
/* Total number of UTF-8 bytes needed to encode the first n code points
   of wcstr (sum of u8_charlen over each element). */
size_t u8_codingsize(uint32_t *wcstr, size_t n)
{
    const uint32_t *cur = wcstr;
    size_t remaining = n;
    size_t total = 0;

    while (remaining-- > 0)
        total += u8_charlen(*cur++);
    return total;
}
80
81 /* conversions without error checking
82 only works for valid UTF-8, i.e. no 5- or 6-byte sequences
83 srcsz = source size in bytes
84 sz = dest size in # of wide characters
85
86 returns # characters converted
87 if sz == srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space.
88 */
89 size_t u8_toucs(uint32_t *dest, size_t sz, const char *src, size_t srcsz)
90 {
91 uint32_t ch;
92 const char *src_end = src + srcsz;
93 size_t nb;
94 size_t i=0;
95
96 if (sz == 0 || srcsz == 0)
97 return 0;
98
99 while (i < sz) {
100 if (!isutf(*src)) { // invalid sequence
101 dest[i++] = 0xFFFD;
102 src++;
103 if (src >= src_end) break;
104 continue;
105 }
106 nb = trailingBytesForUTF8[(unsigned char)*src];
107 if (src + nb >= src_end)
108 break;
109 ch = 0;
110 switch (nb) {
111 /* these fall through deliberately */
112 case 5: ch += (unsigned char)*src++; ch <<= 6;
113 case 4: ch += (unsigned char)*src++; ch <<= 6;
114 case 3: ch += (unsigned char)*src++; ch <<= 6;
115 case 2: ch += (unsigned char)*src++; ch <<= 6;
116 case 1: ch += (unsigned char)*src++; ch <<= 6;
117 case 0: ch += (unsigned char)*src++;
118 }
119 ch -= offsetsFromUTF8[nb];
120 dest[i++] = ch;
121 }
122 return i;
123 }
124
/* Encode an array of code points as UTF-8.

   dest  = output buffer
   sz    = size of dest in bytes
   src   = code points to encode
   srcsz = number of code points in src

   Returns the number of bytes stored in dest. No terminator is written.
   Encoding stops at the first code point that does not fit completely
   in the remaining space. Values of 0x110000 and above are skipped.
   The output never needs more bytes than the source array occupies
   (at most 4 bytes per 4-byte source element).
*/
size_t u8_toutf8(char *dest, size_t sz, const uint32_t *src, size_t srcsz)
{
    uint32_t ch;
    size_t i;
    size_t out = 0;     /* bytes written so far; always <= sz */
    size_t room;

    for (i = 0; i < srcsz; i++) {
        ch = src[i];
        /* Compare against the remaining byte count instead of forming
           "end - k" pointers, which would point before the buffer when
           sz is smaller than the sequence length. */
        room = sz - out;
        if (ch < 0x80) {
            if (room < 1)
                break;
            dest[out++] = (char)ch;
        }
        else if (ch < 0x800) {
            if (room < 2)
                break;
            dest[out++] = (char)((ch >> 6) | 0xC0);
            dest[out++] = (char)((ch & 0x3F) | 0x80);
        }
        else if (ch < 0x10000) {
            if (room < 3)
                break;
            dest[out++] = (char)((ch >> 12) | 0xE0);
            dest[out++] = (char)(((ch >> 6) & 0x3F) | 0x80);
            dest[out++] = (char)((ch & 0x3F) | 0x80);
        }
        else if (ch < 0x110000) {
            if (room < 4)
                break;
            dest[out++] = (char)((ch >> 18) | 0xF0);
            dest[out++] = (char)(((ch >> 12) & 0x3F) | 0x80);
            dest[out++] = (char)(((ch >> 6) & 0x3F) | 0x80);
            dest[out++] = (char)((ch & 0x3F) | 0x80);
        }
        /* anything larger produces no output */
    }
    return out;
}
170
/* Encode the single code point ch into dest, which must have room for up
   to 4 bytes. Returns the number of bytes written, or 0 (writing
   nothing) for values of 0x110000 and above. No terminator is added. */
size_t u8_wc_toutf8(char *dest, uint32_t ch)
{
    size_t len, k;
    uint32_t lead_mark;

    if (ch < 0x80) {
        dest[0] = (char)ch;
        return 1;
    }

    if (ch < 0x800) {
        len = 2;
        lead_mark = 0xC0;
    }
    else if (ch < 0x10000) {
        len = 3;
        lead_mark = 0xE0;
    }
    else if (ch < 0x110000) {
        len = 4;
        lead_mark = 0xF0;
    }
    else {
        return 0;
    }

    /* fill continuation bytes from the back, 6 payload bits each */
    for (k = len - 1; k > 0; k--) {
        dest[k] = (char)((ch & 0x3F) | 0x80);
        ch >>= 6;
    }
    /* what is left of ch is the payload of the first byte */
    dest[0] = (char)(ch | lead_mark);
    return len;
}
197
/* Character index => byte offset.
   Walks charnum characters forward from s[0] and returns the byte index
   reached. There is no check for the end of the string. */
size_t u8_offset(const char *s, size_t charnum)
{
    size_t pos = 0;
    size_t left;

    for (left = charnum; left > 0; left--) {
        const int multibyte = s[pos] & 0x80;

        pos++;
        if (!multibyte)
            continue;
        /* The byte right after a high-bit byte is skipped without being
           inspected; then stop at the first byte that starts a new
           sequence, giving up after 4 bytes in total. */
        pos++;
        if (isutf(s[pos]))
            continue;
        pos++;
        if (isutf(s[pos]))
            continue;
        pos++;
    }
    return pos;
}
211
/* Byte offset => character index.
   Counts the characters that start before byte index `offset`. */
size_t u8_charnum(const char *s, size_t offset)
{
    size_t pos = 0;
    size_t nchars = 0;

    for (; pos < offset; nchars++) {
        const int multibyte = s[pos] & 0x80;

        pos++;
        if (!multibyte)
            continue;
        /* skip the byte after a high-bit byte unseen, then stop at the
           first byte that starts a new sequence (4 bytes at most) */
        pos++;
        if (isutf(s[pos]))
            continue;
        pos++;
        if (isutf(s[pos]))
            continue;
        pos++;
    }
    return nchars;
}
225
/* Number of characters in a NUL-terminated UTF-8 string.
   Runs of ASCII bytes are counted in bulk; every other byte starts a
   multi-byte sequence that counts as one character. */
size_t u8_strlen(const char *s)
{
    size_t count = 0;
    size_t i = 0, run_start;

    for (;;) {
        run_start = i;
        /* Test the high bit explicitly: comparing a plain char with 0
           gives different answers depending on whether char is signed. */
        while (s[i] != '\0' && (s[i] & 0x80) == 0)
            i++;
        count += (i - run_start);
        if (s[i++] == '\0')
            break;
        /* s[i-1] was a high-bit byte; skip to the start of the next
           sequence (sequences of up to 4 bytes) */
        (void)(isutf(s[++i]) || isutf(s[++i]) || ++i);
        count++;
    }
    return count;
}
243
244 int wcwidth(wchar_t c);
245
246 size_t u8_strwidth(const char *s)
247 {
248 uint32_t ch;
249 size_t nb, tot=0;
250 int w;
251 signed char sc;
252
253 while ((sc = (signed char)*s) != 0) {
254 if (sc >= 0) {
255 s++;
256 if (sc) tot++;
257 }
258 else {
259 if (!isutf(sc)) { tot++; s++; continue; }
260 nb = trailingBytesForUTF8[(unsigned char)sc];
261 ch = 0;
262 switch (nb) {
263 /* these fall through deliberately */
264 case 5: ch += (unsigned char)*s++; ch <<= 6;
265 case 4: ch += (unsigned char)*s++; ch <<= 6;
266 case 3: ch += (unsigned char)*s++; ch <<= 6;
267 case 2: ch += (unsigned char)*s++; ch <<= 6;
268 case 1: ch += (unsigned char)*s++; ch <<= 6;
269 case 0: ch += (unsigned char)*s++;
270 }
271 ch -= offsetsFromUTF8[nb];
272 w = wcwidth(ch); // might return -1
273 if (w > 0) tot += w;
274 }
275 }
276 return tot;
277 }
278
279 /* reads the next utf-8 sequence out of a string, updating an index */
280 uint32_t u8_nextchar(const char *s, size_t *i)
281 {
282 uint32_t ch = 0;
283 size_t sz = 0;
284
285 do {
286 ch <<= 6;
287 ch += (unsigned char)s[(*i)];
288 sz++;
289 } while (s[*i] && (++(*i)) && !isutf(s[*i]));
290 ch -= offsetsFromUTF8[sz-1];
291
292 return ch;
293 }
294
295 /* next character without NUL character terminator */
296 uint32_t u8_nextmemchar(const char *s, size_t *i)
297 {
298 uint32_t ch = 0;
299 size_t sz = 0;
300 do {
301 ch <<= 6;
302 ch += (unsigned char)s[(*i)++];
303 sz++;
304 } while (!isutf(s[*i]));
305 ch -= offsetsFromUTF8[sz-1];
306
307 return ch;
308 }
309
/* Advance *i to the next byte that starts a sequence, moving by at
   least 1 and at most 4 bytes. */
void u8_inc(const char *s, size_t *i)
{
    int tries;

    for (tries = 0; tries < 3; tries++) {
        ++(*i);
        if (isutf(s[*i]))
            return;
    }
    /* three non-starting bytes in a row: take the fourth step unchecked */
    ++(*i);
}
314
/* Move *i back to the previous byte that starts a sequence, moving by
   at least 1 and at most 4 bytes. */
void u8_dec(const char *s, size_t *i)
{
    int tries;

    for (tries = 0; tries < 3; tries++) {
        --(*i);
        if (isutf(s[*i]))
            return;
    }
    /* three non-starting bytes in a row: take the fourth step unchecked */
    --(*i);
}
319
/* Returns 1 if c is one of '0'..'7', otherwise 0. */
int octal_digit(char c)
{
    if (c < '0')
        return 0;
    return c <= '7';
}
324
/* Returns 1 if c is a hexadecimal digit (0-9, A-F, a-f), otherwise 0. */
int hex_digit(char c)
{
    if (c >= '0' && c <= '9')
        return 1;
    if (c >= 'A' && c <= 'F')
        return 1;
    if (c >= 'a' && c <= 'f')
        return 1;
    return 0;
}
331
/* Map the letter of a single-character escape (the character after the
   backslash) to the control character it denotes. Any other character
   is returned unchanged. */
char read_escape_control_char(char c)
{
    switch (c) {
    case 'n': return '\n';
    case 't': return '\t';
    case 'r': return '\r';
    case 'e': return 033;   /* '\e' */
    case 'b': return '\b';
    case 'f': return '\f';
    case 'v': return '\v';
    case 'a': return '\a';
    default:  return c;
    }
}
352
/* Parse one escape sequence.

   str  = points at the character AFTER the backslash
   ssz  = number of bytes that may be read from str (must be > 0)
   dest = receives the decoded value

   Recognised forms: 1-3 octal digits, x + up to 2 hex digits,
   u + up to 4 hex digits, U + up to 8 hex digits, and the
   single-letter escapes handled by read_escape_control_char (any other
   character stands for itself).

   Returns the number of input characters consumed, or 0 on error
   (x/u/U not followed by a hex digit, or ssz == 0); *dest is not
   written in that case. */
size_t u8_read_escape_sequence(const char *str, size_t ssz, uint32_t *dest)
{
    uint32_t ch;
    char digs[10];
    int dno = 0, ndig = 0;
    size_t i = 1;
    char c0;

    assert(ssz > 0);
    if (ssz == 0)       /* still safe when assertions are compiled out */
        return 0;
    c0 = str[0];

    if (octal_digit(c0)) {
        i = 0;
        do {
            digs[dno++] = str[i++];
        } while (i < ssz && octal_digit(str[i]) && dno < 3);
        digs[dno] = '\0';
        ch = (uint32_t)strtoul(digs, NULL, 8);
    }
    else if (c0 == 'x' || c0 == 'u' || c0 == 'U') {
        ndig = (c0 == 'x') ? 2 : (c0 == 'u') ? 4 : 8;
        while (i < ssz && hex_digit(str[i]) && dno < ndig) {
            digs[dno++] = str[i++];
        }
        if (dno == 0)
            return 0;
        digs[dno] = '\0';
        /* unsigned parse: 8 hex digits do not fit in a 32-bit long */
        ch = (uint32_t)strtoul(digs, NULL, 16);
    }
    else {
        ch = (uint32_t)read_escape_control_char(c0);
    }
    *dest = ch;

    return i;
}
389
/* Convert a string containing backslash escapes (\n, \ooo, \xhh,
   \uxxxx, \Uxxxxxxxx, ...) to UTF-8.
   example: u8_unescape(mybuf, 256, "hello\\u220e")
   note the double backslash is needed if called on a C string literal

   buf = output buffer
   sz  = size of buf in bytes
   src = NUL-terminated input

   Bytes that are not part of an escape are copied unchanged. A backslash
   that ends the string, or that starts a malformed escape, is copied
   literally. Returns the number of bytes stored; a terminating NUL is
   added only if there is room left for it. */
size_t u8_unescape(char *buf, size_t sz, const char *src)
{
    size_t c = 0, amt;
    uint32_t ch = 0;
    char temp[4];

    while (*src && c < sz) {
        /* Only parse an escape when something follows the backslash, so
           src can never be moved past the terminator. The size passed
           down is just an upper bound; scanning stops at the NUL. */
        if (*src == '\\' && src[1] != '\0' &&
            (amt = u8_read_escape_sequence(src + 1, 1000, &ch)) > 0) {
            src += 1 + amt;
            amt = u8_wc_toutf8(temp, ch);   /* 0 if ch is not encodable */
        }
        else {
            /* Copy the raw byte. Going through a code point here would
               mangle bytes >= 0x80 of already-encoded input. */
            temp[0] = *src++;
            amt = 1;
        }
        if (amt > sz - c)
            break;
        memcpy(&buf[c], temp, amt);
        c += amt;
    }
    if (c < sz)
        buf[c] = '\0';
    return c;
}
419
/* Store the first two characters of src in buf followed by a NUL
   (3 bytes written). Always returns 2. */
static int buf_put2c(char *buf, const char *src)
{
    char *out = buf;

    *out++ = *src++;
    *out++ = *src;
    *out = '\0';
    return 2;
}
427
/* Write the escaped, NUL-terminated text form of code point ch to buf.

   buf = output buffer
   sz  = size of buf in bytes (must be > 2)

   Named control characters and the backslash become two-character
   escapes; other values below 32 and 0x7f become \xhh; values above
   0xFFFF become \Uxxxxxxxx; other values from 0x80 up become \uxxxx;
   everything else is stored as a single byte.
   Returns 2 for two-character escapes, the snprintf result for numeric
   escapes, and 1 for a plain character. */
int u8_escape_wchar(char *buf, size_t sz, uint32_t ch)
{
    const char *named = NULL;

    assert(sz > 2);

    switch (ch) {
    case L'\n': named = "\\n";  break;
    case L'\t': named = "\\t";  break;
    case L'\r': named = "\\r";  break;
    case 033:   named = "\\e";  break;   /* L'\e' */
    case L'\b': named = "\\b";  break;
    case L'\f': named = "\\f";  break;
    case L'\v': named = "\\v";  break;
    case L'\a': named = "\\a";  break;
    case L'\\': named = "\\\\"; break;
    default:    break;
    }
    if (named != NULL)
        return buf_put2c(buf, named);

    if (ch < 32 || ch == 0x7f)
        return snprintf(buf, sz, "\\x%.2hhx", (unsigned char)ch);
    if (ch > 0xFFFF)
        return snprintf(buf, sz, "\\U%.8x", (uint32_t)ch);
    if (ch >= 0x80)
        return snprintf(buf, sz, "\\u%.4hx", (unsigned short)ch);

    /* printable ASCII */
    buf[0] = (char)ch;
    buf[1] = '\0';
    return 1;
}
460
/* Escape part of a UTF-8 buffer into buf.

   buf           = output buffer
   sz            = size of buf in bytes (must be > 11)
   src           = input bytes
   pi            = in: index in src to start at; out: index of the first
                   input byte that was not processed
   end           = index in src at which to stop
   escape_quotes = if nonzero, '"' is written as \"
   ascii         = if nonzero, every character other than the two above
                   goes through u8_escape_wchar; otherwise only those
                   for which iswprint() is false

   Processing stops early once fewer than 11 bytes remain in buf.
   The output is NUL-terminated; the return value counts the bytes
   written including that terminator. */
size_t u8_escape(char *buf, size_t sz, const char *src, size_t *pi, size_t end,
                 int escape_quotes, int ascii)
{
    char *const start = buf;
    char *const limit = start + sz-11;
    size_t i = *pi;
    size_t mark;
    uint32_t ch;

    assert(sz > 11);

    while (i<end && buf<limit) {
        // sz-11: leaves room for longest escape sequence
        const char cur = src[i];

        if (cur == '\\' || (escape_quotes && cur == '"')) {
            buf += buf_put2c(buf, cur == '"' ? "\\\"" : "\\\\");
            i++;
            continue;
        }

        mark = i;
        ch = u8_nextmemchar(src, &i);
        if (ascii || !iswprint((wint_t)ch)) {
            buf += u8_escape_wchar(buf, sz - (buf-start), ch);
            continue;
        }

        /* printable: rewind and copy the sequence's bytes as they are */
        i = mark;
        for (;;) {
            *buf++ = src[i++];
            if (isutf(src[i]))
                break;
        }
    }
    *buf++ = '\0';
    *pi = i;
    return (buf-start);
}
498
/* Find the first occurrence of code point ch in the NUL-terminated
   string s. Returns a pointer to the start of its sequence, or NULL if
   it does not occur. *charn receives the number of characters that
   precede the match (or the total number scanned when there is none). */
char *u8_strchr(const char *s, uint32_t ch, size_t *charn)
{
    size_t pos = 0;

    *charn = 0;
    while (s[pos] != '\0') {
        const size_t begin = pos;

        if (u8_nextchar(s, &pos) == ch) {
            /* it's const for us, but not necessarily the caller */
            return (char*)&s[begin];
        }
        ++*charn;
    }
    return NULL;
}
516
517 char *u8_memchr(const char *s, uint32_t ch, size_t sz, size_t *charn)
518 {
519 size_t i = 0, lasti=0;
520 uint32_t c;
521 int csz;
522
523 *charn = 0;
524 while (i < sz) {
525 c = csz = 0;
526 do {
527 c <<= 6;
528 c += (unsigned char)s[i++];
529 csz++;
530 } while (i < sz && !isutf(s[i]));
531 c -= offsetsFromUTF8[csz-1];
532
533 if (c == ch) {
534 return (char*)&s[lasti];
535 }
536 lasti = i;
537 (*charn)++;
538 }
539 return NULL;
540 }
541
/* Find the last occurrence of code point ch within the first sz bytes
   of s, scanning backwards one character at a time. Returns a pointer
   to the start of its sequence, or NULL if it does not occur. */
char *u8_memrchr(const char *s, uint32_t ch, size_t sz)
{
    size_t pos, probe, prev;

    if (sz == 0)
        return NULL;

    /* back up from the last byte to the start of the last sequence */
    pos = sz - 1;
    while (pos != 0 && !isutf(s[pos]))
        pos--;

    for (;;) {
        probe = pos;    /* decode on a copy so pos stays at the start */
        if (u8_nextmemchar(s, &probe) == ch)
            return (char*)&s[pos];
        if (pos == 0)
            return NULL;
        prev = pos;
        u8_dec(s, &pos);
        if (pos > prev)     /* index wrapped below zero */
            return NULL;
    }
}
566
/* Returns 1 if the encoding part of a locale name (the text after the
   first '.', up to '@', '+', ',' or the end) is exactly "UTF-8" or
   "utf8"; returns 0 otherwise, including for a NULL argument.
   This code is based on libutf8. */
int u8_is_locale_utf8(const char *locale)
{
    const char *enc;
    const char *stop;

    if (locale == NULL)
        return 0;

    /* locate the '.'; a modifier separator before it means no encoding */
    for (enc = locale; *enc != '.'; enc++) {
        if (*enc == '\0' || *enc == '@' || *enc == '+' || *enc == ',')
            return 0;
    }
    enc++;

    /* find where the encoding name ends */
    stop = enc;
    while (*stop != '\0' && *stop != '@' && *stop != '+' && *stop != ',')
        stop++;

    if (stop - enc == 5)
        return strncmp(enc, "UTF-8", 5) == 0;
    if (stop - enc == 4)
        return strncmp(enc, "utf8", 4) == 0;
    return 0;
}
587
/* Format fmt/ap with vsnprintf, convert the UTF-8 result to code points
   with u8_toucs and print it with printf("%ls").
   Returns the number of code points converted, or 0 if formatting or
   memory allocation fails.
   NOTE(review): the "%ls" output treats the uint32_t array as wchar_t,
   which presumably requires a 32-bit wchar_t - confirm for each target. */
size_t u8_vprintf(const char *fmt, va_list ap)
{
    char stackbuf[512];
    uint32_t stackwcs[512];
    char *buf = stackbuf;
    uint32_t *wcs = stackwcs;
    va_list ap2;
    int cnt;
    size_t nc;

    /* A va_list may only be traversed once, so keep a copy for the
       second formatting pass. */
    va_copy(ap2, ap);
    cnt = vsnprintf(stackbuf, sizeof stackbuf, fmt, ap);
    if (cnt < 0) {
        va_end(ap2);
        return 0;
    }
    if ((size_t)cnt >= sizeof stackbuf) {
        /* output was truncated: redo both buffers on the heap so stack
           use does not grow with the length of the output */
        buf = malloc((size_t)cnt + 1);
        wcs = malloc(((size_t)cnt + 1) * sizeof *wcs);
        if (buf == NULL || wcs == NULL) {
            free(buf);
            free(wcs);
            va_end(ap2);
            return 0;
        }
        vsnprintf(buf, (size_t)cnt + 1, fmt, ap2);
    }
    va_end(ap2);

    /* one code point per byte at most, plus room for the terminator */
    nc = u8_toucs(wcs, (size_t)cnt + 1, buf, (size_t)cnt);
    wcs[nc] = 0;
    printf("%ls", (wchar_t*)wcs);

    if (buf != stackbuf) {
        free(buf);
        free(wcs);
    }
    return nc;
}
611
/* printf-style front end for u8_vprintf: collects the variable
   arguments and returns whatever u8_vprintf returns. */
size_t u8_printf(const char *fmt, ...)
{
    va_list args;
    size_t printed;

    va_start(args, fmt);
    printed = u8_vprintf(fmt, args);
    va_end(args);

    return printed;
}
624
625 /* based on the valid_utf8 routine from the PCRE library by Philip Hazel
626
627 length is in bytes, since without knowing whether the string is valid
628 it's hard to know how many characters there are! */
629 int u8_isvalid(const char *str, size_t length)
630 {
631 const unsigned char *p, *pend = (unsigned char*)str + length;
632 unsigned char c;
633 int ret = 1; /* ASCII */
634 size_t ab;
635
636 for (p = (unsigned char*)str; p < pend; p++) {
637 c = *p;
638 if (c < 128)
639 continue;
640 ret = 2; /* non-ASCII UTF-8 */
641 if ((c & 0xc0) != 0xc0)
642 return 0;
643 ab = trailingBytesForUTF8[c];
644 if (length < ab)
645 return 0;
646 length -= ab;
647
648 p++;
649 /* Check top bits in the second byte */
650 if ((*p & 0xc0) != 0x80)
651 return 0;
652
653 /* Check for overlong sequences for each different length */
654 switch (ab) {
655 /* Check for xx00 000x */
656 case 1:
657 if ((c & 0x3e) == 0) return 0;
658 continue; /* We know there aren't any more bytes to check */
659
660 /* Check for 1110 0000, xx0x xxxx */
661 case 2:
662 if (c == 0xe0 && (*p & 0x20) == 0) return 0;
663 break;
664
665 /* Check for 1111 0000, xx00 xxxx */
666 case 3:
667 if (c == 0xf0 && (*p & 0x30) == 0) return 0;
668 break;
669
670 /* Check for 1111 1000, xx00 0xxx */
671 case 4:
672 if (c == 0xf8 && (*p & 0x38) == 0) return 0;
673 break;
674
675 /* Check for leading 0xfe or 0xff,
676 and then for 1111 1100, xx00 00xx */
677 case 5:
678 if (c == 0xfe || c == 0xff ||
679 (c == 0xfc && (*p & 0x3c) == 0)) return 0;
680 break;
681 }
682
683 /* Check for valid bytes after the 2nd, if any; all must start 10 */
684 while (--ab > 0) {
685 if ((*(++p) & 0xc0) != 0x80) return 0;
686 }
687 }
688
689 return ret;
690 }
691
/* Write the characters of src to dest in reverse order, keeping the
   bytes of each multi-byte sequence in their original order.

   dest = output buffer of at least len+1 bytes; must not overlap src
   src  = UTF-8 input
   len  = size of src in bytes

   dest is NUL-terminated at dest[len]. Returns 0 on success, or 1 if a
   byte that cannot start a sequence is found where one is expected or
   the last sequence is cut off by the end of the input; the contents of
   dest are then incomplete. */
int u8_reverse(char *dest, char * src, size_t len)
{
    size_t si = 0, di = len;
    size_t n;
    unsigned char c;

    dest[di] = '\0';
    while (si < len) {
        c = (unsigned char)src[si];
        if ((~c) & 0x80) {
            n = 1;                      /* ASCII */
        }
        else {
            switch (c>>4) {
            case 0xC:
            case 0xD:
                n = 2;
                break;
            case 0xE:
                n = 3;
                break;
            case 0xF:
                n = 4;
                break;
            default:
                return 1;               /* stray continuation byte */
            }
        }
        /* di == len - si here, so this check also guarantees di >= n */
        if (n > len - si)
            return 1;
        di -= n;
        /* byte-wise copy: the data has no alignment guarantees */
        memcpy(&dest[di], &src[si], n);
        si += n;
    }
    return 0;
}