tenc-unicode.lisp - clic - Clic is an command line interactive client for gopher written in Common LISP HTML git clone git://bitreich.org/clic/ git://hg6vgqziawt5s4dj.onion/clic/ DIR Log DIR Files DIR Refs DIR Tags DIR LICENSE --- tenc-unicode.lisp (42416B) --- 1 ;;;; -*- Mode: lisp; indent-tabs-mode: nil -*- 2 ;;; 3 ;;; enc-unicode.lisp --- Unicode encodings. 4 ;;; 5 ;;; Copyright (C) 2007, Luis Oliveira <loliveira@common-lisp.net> 6 ;;; 7 ;;; Permission is hereby granted, free of charge, to any person 8 ;;; obtaining a copy of this software and associated documentation 9 ;;; files (the "Software"), to deal in the Software without 10 ;;; restriction, including without limitation the rights to use, copy, 11 ;;; modify, merge, publish, distribute, sublicense, and/or sell copies 12 ;;; of the Software, and to permit persons to whom the Software is 13 ;;; furnished to do so, subject to the following conditions: 14 ;;; 15 ;;; The above copyright notice and this permission notice shall be 16 ;;; included in all copies or substantial portions of the Software. 17 ;;; 18 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 ;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 ;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 ;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 ;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 ;;; DEALINGS IN THE SOFTWARE. 26 27 ;;; This implementation is largely based on OpenMCL's l1-unicode.lisp 28 ;;; Copyright (C) 2006 Clozure Associates and contributors. 29 30 (in-package #:babel-encodings) 31 32 (eval-when (:compile-toplevel :load-toplevel :execute) 33 (defconstant +repl+ #xfffd "Unicode replacement character code point.") 34 (defconstant +byte-order-mark-code+ #xfeff) 35 (defconstant +swapped-byte-order-mark-code+ #xfffe) 36 (defconstant +swapped-byte-order-mark-code-32+ #xfffe0000)) 37 38 ;;; Some convenience macros adding FIXNUM declarations. 39 (defmacro f-ash (integer count) `(the fixnum (ash ,integer ,count))) 40 (defmacro f-logior (&rest integers) `(the fixnum (logior ,@integers))) 41 (defmacro f-logand (&rest integers) `(the fixnum (logand ,@integers))) 42 (defmacro f-logxor (&rest integers) `(the fixnum (logxor ,@integers))) 43 44 ;;;; UTF-8 45 46 (define-character-encoding :utf-8 47 "An 8-bit, variable-length character encoding in which 48 character code points in the range #x00-#x7f can be encoded in a 49 single octet; characters with larger code values can be encoded 50 in 2 to 4 bytes." 51 :max-units-per-char 4 52 :literal-char-code-limit #x80 53 :bom-encoding #(#xef #xbb #xbf) 54 :default-replacement #xfffd) 55 56 (define-condition invalid-utf8-starter-byte (character-decoding-error) 57 () 58 (:documentation "Signalled when an invalid UTF-8 starter byte is found.")) 59 60 (define-condition invalid-utf8-continuation-byte (character-decoding-error) 61 () 62 (:documentation 63 "Signalled when an invalid UTF-8 continuation byte is found.")) 64 65 (define-condition overlong-utf8-sequence (character-decoding-error) 66 () 67 (:documentation "Signalled upon overlong UTF-8 sequences.")) 68 69 (define-octet-counter :utf-8 (getter type) 70 `(named-lambda utf-8-octet-counter (seq start end max) 71 (declare (type ,type seq) (fixnum start end max)) 72 (loop with noctets fixnum = 0 73 for i fixnum from start below end 74 for code of-type code-point = (,getter seq i) do 75 (let ((new (+ (cond ((< code #x80) 1) 76 ((< code #x800) 2) 77 ((< code #x10000) 3) 78 (t 4)) 79 noctets))) 80 (if (and (plusp max) (> new max)) 81 (loop-finish) 82 (setq noctets new))) 83 finally (return (values noctets i))))) 84 85 (define-code-point-counter :utf-8 (getter type) 86 `(named-lambda utf-8-code-point-counter (seq start end max) 87 (declare (type ,type seq) (fixnum start end max)) 88 (loop with nchars fixnum = 0 89 with i fixnum = start 90 while (< i end) do 91 ;; check for invalid continuation bytes 92 (macrolet ((invalid-cb-p (n) 93 `(and (< (+ i ,n) end) 94 (not (< #x7f (,',getter seq (+ i ,n)) #xc0))))) 95 ;; wrote this code with LET instead of FOR because CLISP's 96 ;; LOOP doesn't like WHILE clauses before FOR clauses. 97 (let* ((octet (,getter seq i)) 98 (next-i (+ i (cond ((or (< octet #xc0) (invalid-cb-p 1)) 1) 99 ((or (< octet #xe0) (invalid-cb-p 2)) 2) 100 ((or (< octet #xf0) (invalid-cb-p 3)) 3) 101 ((or (< octet #xf8) (invalid-cb-p 4)) 4) 102 ((or (< octet #xfc) (invalid-cb-p 5)) 5) 103 (t 6))))) 104 (declare (type ub8 octet) (fixnum next-i)) 105 (cond 106 ((> next-i end) 107 ;; Should we add restarts to this error, we'll have 108 ;; to figure out a way to communicate with the 109 ;; decoder since we probably want to do something 110 ;; about it right here when we have a chance to 111 ;; change the count or something. (Like an 112 ;; alternative replacement character or perhaps the 113 ;; existence of this error so that the decoder 114 ;; doesn't have to check for it on every iteration 115 ;; like we do.) 116 ;; 117 ;; FIXME: The data for this error is not right. 118 (decoding-error (vector octet) :utf-8 seq i 119 nil 'end-of-input-in-character) 120 (return (values (1+ nchars) end))) 121 (t 122 (setq nchars (1+ nchars) 123 i next-i) 124 (when (and (plusp max) (= nchars max)) 125 (return (values nchars i))))))) 126 finally (progn 127 (assert (= i end)) 128 (return (values nchars i)))))) 129 130 (define-encoder :utf-8 (getter src-type setter dest-type) 131 `(named-lambda utf-8-encoder (src start end dest d-start) 132 (declare (type ,src-type src) 133 (type ,dest-type dest) 134 (fixnum start end d-start)) 135 (loop with di fixnum = d-start 136 for i fixnum from start below end 137 for code of-type code-point = (,getter src i) do 138 (macrolet ((set-octet (offset value) 139 `(,',setter ,value dest (the fixnum (+ di ,offset))))) 140 (cond 141 ;; 1 octet 142 ((< code #x80) 143 (set-octet 0 code) 144 (incf di)) 145 ;; 2 octets 146 ((< code #x800) 147 (set-octet 0 (logior #xc0 (f-ash code -6))) 148 (set-octet 1 (logior #x80 (f-logand code #x3f))) 149 (incf di 2)) 150 ;; 3 octets 151 ((< code #x10000) 152 (set-octet 0 (logior #xe0 (f-ash code -12))) 153 (set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -6)))) 154 (set-octet 2 (logior #x80 (f-logand code #x3f))) 155 (incf di 3)) 156 ;; 4 octets 157 (t 158 (set-octet 0 (logior #xf0 (f-logand #x07 (f-ash code -18)))) 159 (set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -12)))) 160 (set-octet 2 (logior #x80 (f-logand #x3f (f-ash code -6)))) 161 (set-octet 3 (logior #x80 (logand code #x3f))) 162 (incf di 4)))) 163 finally (return (the fixnum (- di d-start)))))) 164 165 (define-decoder :utf-8 (getter src-type setter dest-type) 166 `(named-lambda utf-8-decoder (src start end dest d-start) 167 (declare (type ,src-type src) 168 (type ,dest-type dest) 169 (fixnum start end d-start)) 170 (let ((u2 0) (u3 0) (u4 0) (u5 0) (u6 0)) 171 (declare (type ub8 u2 u3 u4 u5 u6)) 172 (loop for di fixnum from d-start 173 for i fixnum from start below end 174 for u1 of-type ub8 = (,getter src i) do 175 ;; Note: CONSUME-OCTET doesn't check if I is being 176 ;; incremented past END. We're assuming that END has 177 ;; been calculated with the CODE-POINT-POINTER above that 178 ;; checks this. 179 (macrolet 180 ((consume-octet () 181 `(let ((next-i (incf i))) 182 (if (= next-i end) 183 ;; FIXME: data for this error is incomplete. 184 ;; and signalling this error twice 185 (return-from setter-block 186 (decoding-error nil :utf-8 src i +repl+ 187 'end-of-input-in-character)) 188 (,',getter src next-i)))) 189 (handle-error (n &optional (c 'character-decoding-error)) 190 `(decoding-error 191 (vector ,@(subseq '(u1 u2 u3 u4 u5 u6) 0 n)) 192 :utf-8 src (1+ (- i ,n)) +repl+ ',c)) 193 (handle-error-if-icb (var n) 194 `(when (not (< #x7f ,var #xc0)) 195 (decf i) 196 (return-from setter-block 197 (handle-error ,n invalid-utf8-continuation-byte))))) 198 (,setter 199 (block setter-block 200 (cond 201 ((< u1 #x80) u1) ; 1 octet 202 ((< u1 #xc0) 203 (handle-error 1 invalid-utf8-starter-byte)) 204 (t 205 (setq u2 (consume-octet)) 206 (handle-error-if-icb u2 1) 207 (cond 208 ((< u1 #xc2) 209 (handle-error 2 overlong-utf8-sequence)) 210 ((< u1 #xe0) ; 2 octets 211 (logior (f-ash (f-logand #x1f u1) 6) 212 (f-logxor u2 #x80))) 213 (t 214 (setq u3 (consume-octet)) 215 (handle-error-if-icb u3 2) 216 (cond 217 ((and (= u1 #xe0) (< u2 #xa0)) 218 (handle-error 3 overlong-utf8-sequence)) 219 ((< u1 #xf0) ; 3 octets 220 (let ((start (f-logior (f-ash (f-logand u1 #x0f) 12) 221 (f-ash (f-logand u2 #x3f) 6)))) 222 (if (<= #xd800 start #xdfc0) 223 (handle-error 3 character-out-of-range) 224 (logior start (f-logand u3 #x3f))))) 225 (t ; 4 octets 226 (setq u4 (consume-octet)) 227 (handle-error-if-icb u4 3) 228 (cond 229 ((and (= u1 #xf0) (< u2 #x90)) 230 (handle-error 4 overlong-utf8-sequence)) 231 ((< u1 #xf8) 232 (if (or (> u1 #xf4) (and (= u1 #xf4) (> u2 #x8f))) 233 (handle-error 4 character-out-of-range) 234 (f-logior (f-ash (f-logand u1 7) 18) 235 (f-ash (f-logxor u2 #x80) 12) 236 (f-ash (f-logxor u3 #x80) 6) 237 (f-logxor u4 #x80)))) 238 ;; from here on we'll be getting either 239 ;; invalid continuation bytes or overlong 240 ;; 5-byte or 6-byte sequences. 241 (t 242 (setq u5 (consume-octet)) 243 (handle-error-if-icb u5 4) 244 (cond 245 ((and (= u1 #xf8) (< u2 #x88)) 246 (handle-error 5 overlong-utf8-sequence)) 247 ((< u1 #xfc) 248 (handle-error 5 character-out-of-range)) 249 (t 250 (setq u6 (consume-octet)) 251 (handle-error-if-icb u6 5) 252 (cond 253 ((and (= u1 #xfc) (< u2 #x84)) 254 (handle-error 6 overlong-utf8-sequence)) 255 (t 256 (handle-error 6 character-out-of-range) 257 ))))))))))))) 258 dest di)) 259 finally (return (the fixnum (- di d-start))))))) 260 261 ;;;; UTF-8B 262 263 ;;; The following excerpt from a linux-utf8 message by Markus Kuhn is 264 ;;; the closest thing to a UTF-8B specification: 265 ;;; 266 ;;; <http://mail.nl.linux.org/linux-utf8/2000-07/msg00040.html> 267 ;;; 268 ;;; "D) Emit a malformed UTF-16 sequence for every byte in a malformed 269 ;;; UTF-8 sequence 270 ;;; 271 ;;; All the previous options for converting malformed UTF-8 sequences 272 ;;; to UTF-16 destroy information. This can be highly undesirable in 273 ;;; applications such as text file editors, where guaranteed binary 274 ;;; transparency is a desireable feature. (E.g., I frequently edit 275 ;;; executable code or graphic files with the Emacs text editor and I 276 ;;; hate the idea that my editor might automatically make U+FFFD 277 ;;; substitutions at locations that I haven't even edited when I save 278 ;;; the file again.) 279 ;;; 280 ;;; I therefore suggested 1999-11-02 on the unicode@xxxxxxxxxxx 281 ;;; mailing list the following approach. Instead of using U+FFFD, 282 ;;; simply encode malformed UTF-8 sequences as malformed UTF-16 283 ;;; sequences. Malformed UTF-8 sequences consist excludively of the 284 ;;; bytes 0x80 - 0xff, and each of these bytes can be represented 285 ;;; using a 16-bit value from the UTF-16 low-half surrogate zone 286 ;;; U+DC80 to U+DCFF. Thus, the overlong "K" (U+004B) 0xc1 0x8b from 287 ;;; the above example would be represented in UTF-16 as U+DCC1 288 ;;; U+DC8B. If we simply make sure that every UTF-8 encoded surrogate 289 ;;; character is also treated like a malformed sequence, then there 290 ;;; is no way that a single high-half surrogate could precede the 291 ;;; encoded malformed sequence and cause a valid UTF-16 sequence to 292 ;;; emerge. 293 ;;; 294 ;;; This way 100% binary transparent UTF-8 -> UTF-16 -> UTF-8 295 ;;; round-trip compatibility can be achieved quite easily. 296 ;;; 297 ;;; On an output device, a lonely low-half surrogate character should 298 ;;; be treated just like a character outside the adopted subset of 299 ;;; representable characters, that is for the end user, the display 300 ;;; would look exactly like with semantics B), i.e. one symbol per 301 ;;; byte of a malformed sequence. However in contrast to semantics 302 ;;; B), no information is thrown away, and a cut&paste in an editor 303 ;;; or terminal emulator will be guaranteed to reconstruct the 304 ;;; original byte sequence. This should greatly reduce the incidence 305 ;;; of accidental corruption of binary data by UTF-8 -> UTF-16 -> 306 ;;; UTF-8 conversion round trips." 307 308 (define-character-encoding :utf-8b 309 "An 8-bit, variable-length character encoding in which 310 character code points in the range #x00-#x7f can be encoded in a 311 single octet; characters with larger code values can be encoded 312 in 2 to 4 bytes. Invalid UTF-8 sequences are encoded with #xDCXX 313 code points for each invalid byte." 314 :max-units-per-char 4 315 :literal-char-code-limit #x80 316 :bom-encoding #(#xef #xbb #xbf) 317 :default-replacement nil) 318 319 ;;; TODO: reuse the :UTF-8 octet counter through a simple macro. 320 (define-octet-counter :utf-8b (getter type) 321 `(named-lambda utf-8b-octet-counter (seq start end max) 322 (declare (type ,type seq) (fixnum start end max)) 323 (loop with noctets fixnum = 0 324 for i fixnum from start below end 325 for code of-type code-point = (,getter seq i) do 326 (let ((new (+ (cond ((< code #x80) 1) 327 ((< code #x800) 2) 328 ((<= #xdc80 code #xdcff) 1) 329 ((< code #x10000) 3) 330 (t 4)) 331 noctets))) 332 (if (and (plusp max) (> new max)) 333 (loop-finish) 334 (setq noctets new))) 335 finally (return (values noctets i))))) 336 337 (define-code-point-counter :utf-8b (getter type) 338 `(named-lambda utf-8b-code-point-counter (seq start end max) 339 (declare (type ,type seq) (fixnum start end max)) 340 (loop with nchars fixnum = 0 341 with i fixnum = start 342 while (< i end) do 343 ;; wrote this code with LET instead of FOR because CLISP's 344 ;; LOOP doesn't like WHILE clauses before FOR clauses. 345 (let* ((octet (,getter seq i)) 346 (noctets (cond ((< octet #x80) 1) 347 ((< octet #xe0) 2) 348 ((< octet #xf0) 3) 349 (t 4)))) 350 (declare (type ub8 octet) (fixnum noctets)) 351 (cond 352 ((> (+ i noctets) end) 353 ;; If this error is suppressed these last few bytes 354 ;; will be encoded as raw bytes later. 355 (decoding-error (vector octet) :utf-8 seq i 356 nil 'end-of-input-in-character) 357 (return (values (+ nchars (- end i)) end))) 358 (t 359 ;; FIXME: clean this mess up. 360 (let* ((u1 octet) 361 (u2 (if (>= noctets 2) (,getter seq (1+ i)) 0)) 362 (u3 (if (>= noctets 3) (,getter seq (+ i 2)) 0)) 363 (u4 (if (= noctets 4) (,getter seq (+ i 3)) 0)) 364 (inc (or (and (> noctets 1) 365 (< u1 #xc2)) 366 (and (= noctets 2) 367 (not (logior u2 #x40))) 368 (and (= noctets 3) 369 (not (and (< (f-logxor u2 #x80) #x40) 370 (< (f-logxor u3 #x80) #x40) 371 (or (>= u1 #xe1) (>= u2 #xa0)) 372 (or (/= u1 #xed) (< u2 #xa0) (> u2 #xbf))))) 373 (and (= noctets 4) 374 (not 375 (and (< (f-logxor u2 #x80) #x40) 376 (< (f-logxor u3 #x80) #x40) 377 (< (f-logxor u4 #x80) #x40) 378 (or (>= u1 #xf1) (>= u2 #x90)))))))) 379 (let ((new-nchars (if inc (+ nchars noctets) (1+ nchars)))) 380 (when (and (plusp max) (> new-nchars max)) 381 (return (values nchars i))) 382 (incf i noctets) 383 (setq nchars new-nchars)))))) 384 finally (progn 385 (assert (= i end)) 386 (return (values nchars i)))))) 387 388 ;;; TODO: reuse the :UTF-8 encoder with through a simple macro. 389 (define-encoder :utf-8b (getter src-type setter dest-type) 390 `(named-lambda utf-8b-encoder (src start end dest d-start) 391 (declare (type ,src-type src) 392 (type ,dest-type dest) 393 (fixnum start end d-start)) 394 (loop with di fixnum = d-start 395 for i fixnum from start below end 396 for code of-type code-point = (,getter src i) do 397 (macrolet ((set-octet (offset value) 398 `(,',setter ,value dest (the fixnum (+ di ,offset))))) 399 (cond 400 ;; 1 octet 401 ((< code #x80) 402 (set-octet 0 code) 403 (incf di)) 404 ;; 2 octets 405 ((< code #x800) 406 (set-octet 0 (logior #xc0 (f-ash code -6))) 407 (set-octet 1 (logior #x80 (f-logand code #x3f))) 408 (incf di 2)) 409 ;; 1 octet (invalid octet) 410 ((<= #xdc80 code #xdcff) 411 (set-octet 0 (f-logand code #xff)) 412 (incf di)) 413 ;; 3 octets 414 ((< code #x10000) 415 (set-octet 0 (logior #xe0 (f-ash code -12))) 416 (set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -6)))) 417 (set-octet 2 (logior #x80 (f-logand code #x3f))) 418 (incf di 3)) 419 ;; 4 octets 420 (t 421 (set-octet 0 (logior #xf0 (f-logand #x07 (f-ash code -18)))) 422 (set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -12)))) 423 (set-octet 2 (logior #x80 (f-logand #x3f (f-ash code -6)))) 424 (set-octet 3 (logand #x3f code)) 425 (incf di 4)))) 426 finally (return (the fixnum (- di d-start)))))) 427 428 (define-decoder :utf-8b (getter src-type setter dest-type) 429 `(named-lambda utf-8b-decoder (src start end dest d-start) 430 (declare (type ,src-type src) 431 (type ,dest-type dest) 432 (fixnum start end d-start)) 433 (let ((u2 0) (u3 0) (u4 0)) 434 (declare (type ub8 u2 u3 u4)) 435 (loop for di fixnum from d-start 436 for i fixnum from start below end 437 for u1 of-type ub8 = (,getter src i) do 438 ;; Unlike the UTF-8 version, this version of 439 ;; CONSUME-OCTET needs to check if I is being incremented 440 ;; past END because we might have trailing binary 441 ;; garbage. 442 (macrolet 443 ((consume-octet (n) 444 `(if (= i (1- end)) 445 (encode-raw-octets ,n) 446 (,',getter src (incf i)))) 447 (encode-raw-octets (n) 448 `(progn 449 ,@(loop for i below n and var in '(u1 u2 u3 u4) 450 collect `(,',setter (logior #xdc00 ,var) dest di) 451 unless (= i (1- n)) 452 collect '(incf di)) 453 (return-from set-body)))) 454 (block set-body 455 (,setter (cond 456 ((< u1 #x80) ; 1 octet 457 u1) 458 ((>= u1 #xc2) 459 (setq u2 (consume-octet 1)) 460 (cond 461 ((< u1 #xe0) ; 2 octets 462 (if (< (f-logxor u2 #x80) #x40) 463 (logior (f-ash (f-logand #x1f u1) 6) 464 (f-logxor u2 #x80)) 465 (encode-raw-octets 2))) 466 (t 467 (setq u3 (consume-octet 2)) 468 (cond 469 ((< u1 #xf0) ; 3 octets 470 (if (and (< (f-logxor u2 #x80) #x40) 471 (< (f-logxor u3 #x80) #x40) 472 (or (>= u1 #xe1) (>= u2 #xa0))) 473 (let ((start (f-logior (f-ash (f-logand u1 #x0f) 12) 474 (f-ash (f-logand u2 #x3f) 6)))) 475 (if (<= #xd800 start #xdfc0) 476 (encode-raw-octets 3) 477 (logior start (f-logand u3 #x3f)))) 478 (encode-raw-octets 3))) 479 (t ; 4 octets 480 (setq u4 (consume-octet 3)) 481 (if (and (< (f-logxor u2 #x80) #x40) 482 (< (f-logxor u3 #x80) #x40) 483 (< (f-logxor u4 #x80) #x40) 484 (or (>= u1 #xf1) (>= u2 #x90))) 485 (logior 486 (f-logior (f-ash (f-logand u1 7) 18) 487 (f-ash (f-logxor u2 #x80) 12)) 488 (f-logior (f-ash (f-logxor u3 #x80) 6) 489 (f-logxor u4 #x80))) 490 (encode-raw-octets 4))))))) 491 (t (encode-raw-octets 1))) 492 dest di))) 493 finally (return (the fixnum (- di d-start))))))) 494 495 ;;;; UTF-16 496 497 ;;; TODO: add a way to pass some info at compile-time telling us that, 498 ;;; for example, the maximum code-point will always be < #x10000 in 499 ;;; which case we could simply return (* 2 (- end start)). 500 (defmacro utf16-octet-counter (getter type) 501 `(named-lambda utf-16-octet-counter (seq start end max) 502 (declare (type ,type seq) (fixnum start end max)) 503 (loop with noctets fixnum = 0 504 for i fixnum from start below end 505 for code of-type code-point = (,getter seq i) 506 do (let ((new (the fixnum (+ (if (< code #x10000) 2 4) noctets)))) 507 (if (and (plusp max) (> new max)) 508 (loop-finish) 509 (setq noctets new))) 510 finally (return (values noctets i))))) 511 512 (defmacro utf-16-combine-surrogate-pairs (u1 u2) 513 `(the (unsigned-byte 21) 514 (+ #x10000 515 (the (unsigned-byte 20) 516 (logior 517 (the (unsigned-byte 20) 518 (ash (the (unsigned-byte 10) (- ,u1 #xd800)) 10)) 519 (the (unsigned-byte 10) 520 (- ,u2 #xdc00))))))) 521 522 (defmacro define-utf-16 (name &optional endianness) 523 (check-type endianness (or null (eql :be) (eql :le))) 524 (check-type name keyword) 525 (let ((swap-var (gensym "SWAP")) 526 (code-point-counter-name 527 (format-symbol t '#:~a-code-point-counter (string name))) 528 (encoder-name (format-symbol t '#:~a-encoder (string name))) 529 (decoder-name (format-symbol t '#:~a-decoder (string name)))) 530 (labels ((make-bom-check-form (end start getter seq) 531 (if (null endianness) 532 ``((,',swap-var 533 (when (> ,,end ,,start) 534 (case (,,getter ,,seq ,,start 2 :ne) 535 (#.+byte-order-mark-code+ (incf ,,start 2) nil) 536 (#.+swapped-byte-order-mark-code+ (incf ,,start 2) t) 537 (t #+little-endian t))))) 538 '())) 539 (make-getter-form (getter src i) 540 (case endianness 541 (:le ``(,,getter ,,src ,,i 2 :le)) 542 (:be ``(,,getter ,,src ,,i 2 :be)) 543 (t ``(if ,',swap-var 544 (,,getter ,,src ,,i 2 :re) 545 (,,getter ,,src ,,i 2 :ne))))) 546 (make-setter-form (setter code dest di) 547 (case endianness 548 (:be ``(,,setter ,,code ,,dest ,,di 2 :be)) 549 (:le ``(,,setter ,,code ,,dest ,,di 2 :le)) 550 (t ``(,,setter ,,code ,,dest ,,di 2 :ne))))) 551 `(progn 552 (define-octet-counter ,name (getter type) 553 `(utf16-octet-counter ,getter ,type)) 554 (define-code-point-counter ,name (getter type) 555 `(named-lambda ,',code-point-counter-name (seq start end max) 556 (declare (type ,type seq) (fixnum start end max)) 557 (let* ,,(make-bom-check-form ''end ''start 'getter ''seq) 558 (loop with count fixnum = 0 559 with i fixnum = start 560 while (<= i (- end 2)) do 561 (let* ((code ,,(make-getter-form 'getter ''seq ''i)) 562 (next-i (+ i (if (or (< code #xd800) (>= code #xdc00)) 563 2 564 4)))) 565 (declare (type (unsigned-byte 16) code) (fixnum next-i)) 566 (cond 567 ((> next-i end) 568 (decoding-error 569 (vector (,getter seq i) (,getter seq (1+ i))) 570 ,',name seq i nil 'end-of-input-in-character) 571 (return (values count i))) 572 (t 573 (setq i next-i 574 count (1+ count)) 575 (when (and (plusp max) (= count max)) 576 (return (values count i)))))) 577 finally (progn 578 (assert (= i end)) 579 (return (values count i))))))) 580 (define-encoder ,name (getter src-type setter dest-type) 581 `(named-lambda ,',encoder-name (src start end dest d-start) 582 (declare (type ,src-type src) 583 (type ,dest-type dest) 584 (fixnum start end d-start)) 585 (loop with di fixnum = d-start 586 for i fixnum from start below end 587 for code of-type code-point = (,getter src i) 588 for high-bits fixnum = (- code #x10000) do 589 (cond ((< high-bits 0) 590 ,,(make-setter-form 'setter ''code ''dest ''di) 591 (incf di 2)) 592 (t 593 ,,(make-setter-form 594 'setter ''(logior #xd800 (f-ash high-bits -10)) 595 ''dest ''di) 596 ,,(make-setter-form 597 'setter ''(logior #xdc00 (f-logand high-bits #x3ff)) 598 ''dest ''(+ di 2)) 599 (incf di 4))) 600 finally (return (the fixnum (- di d-start)))))) 601 (define-decoder ,name (getter src-type setter dest-type) 602 `(named-lambda ,',decoder-name (src start end dest d-start) 603 (declare (type ,src-type src) 604 (type ,dest-type dest) 605 (fixnum start end d-start)) 606 (let ,,(make-bom-check-form ''end ''start 'getter ''src) 607 (loop with i fixnum = start 608 for di fixnum from d-start 609 until (= i end) do 610 (let ((u1 ,,(make-getter-form 'getter ''src ''i))) 611 (declare (type (unsigned-byte 16) u1)) 612 (incf i 2) 613 (,setter (cond 614 ((or (< u1 #xd800) (>= u1 #xe000)) ; 2 octets 615 u1) 616 ((< u1 #xdc00) ; 4 octets 617 (let ((u2 ,,(make-getter-form 'getter ''src ''i))) 618 (declare (type (unsigned-byte 16) u2)) 619 (incf i 2) 620 (if (and (>= u2 #xdc00) (< u2 #xe000)) 621 (utf-16-combine-surrogate-pairs u1 u2) 622 (decoding-error 623 (vector (,getter src (- i 4)) 624 (,getter src (- i 3)) 625 (,getter src (- i 2)) 626 (,getter src (- i 1))) 627 ,',name src i +repl+)))) 628 (t 629 (decoding-error (vector (,getter src (- i 2)) 630 (,getter src (- i 1))) 631 ,',name src i +repl+))) 632 dest di)) 633 finally (return (the fixnum (- di d-start))))))) 634 ',name)))) 635 636 (define-character-encoding :utf-16 637 "A 16-bit, variable-length encoding in which characters with 638 code points less than #x10000 can be encoded in a single 16-bit 639 word and characters with larger codes can be encoded in a pair of 640 16-bit words. The endianness of the encoded data is indicated by 641 the endianness of a byte-order-mark character (#\u+feff) 642 prepended to the data; in the absence of such a character on 643 input, the data is assumed to be in big-endian order. Output is 644 written in native byte-order with a leading byte-order mark." 645 :max-units-per-char 2 646 :code-unit-size 16 647 :native-endianness t ; not necessarily true when decoding 648 :decode-literal-code-unit-limit #xd800 649 :encode-literal-code-unit-limit #x10000 650 :use-bom #+big-endian :utf-16be #+little-endian :utf-16le 651 :bom-encoding #+big-endian #(#xfe #xff) #+little-endian #(#xff #xfe) 652 :nul-encoding #(0 0) 653 :default-replacement #xfffd 654 :ambiguous #+little-endian t #+big-endian nil) 655 656 (define-utf-16 :utf-16) 657 658 (define-character-encoding :utf-16le 659 "A 16-bit, variable-length encoding in which characters with 660 code points less than #x10000 can be encoded in a single 16-bit 661 word and characters with larger codes can be encoded in a pair of 662 16-bit words. The data is assumed to be in little-endian order. Output is 663 written in little-endian byte-order without a leading byte-order mark." 664 :aliases '(:utf-16/le) 665 :max-units-per-char 2 666 :code-unit-size 16 667 :native-endianness #+little-endian t #+big-endian nil 668 :decode-literal-code-unit-limit #xd800 669 :encode-literal-code-unit-limit #x10000 670 :nul-encoding #(0 0) 671 :default-replacement #xfffd) 672 673 (define-utf-16 :utf-16le :le) 674 675 (define-character-encoding :utf-16be 676 "A 16-bit, variable-length encoding in which characters with 677 code points less than #x10000 can be encoded in a single 16-bit 678 word and characters with larger codes can be encoded in a pair of 679 16-bit words. The data is assumed to be in big-endian order. Output is 680 written in big-endian byte-order without a leading byte-order mark." 681 :aliases '(:utf-16/be) 682 :max-units-per-char 2 683 :code-unit-size 16 684 :native-endianness #+little-endian nil #+big-endian t 685 :decode-literal-code-unit-limit #xd800 686 :encode-literal-code-unit-limit #x10000 687 :nul-encoding #(0 0) 688 :default-replacement #xfffd) 689 690 (define-utf-16 :utf-16be :be) 691 692 (defmacro define-ucs (name bytes &optional endianness (limit #x110000)) 693 (check-type name keyword) 694 (check-type bytes (or (eql 2) (eql 4))) 695 (check-type endianness (or null (eql :le) (eql :be))) 696 (let ((swap-var (gensym "SWAP")) 697 (code-point-counter-name 698 (format-symbol t '#:~a-code-point-counter (string name))) 699 (encoder-name 700 (format-symbol t '#:~a-encoder (string name))) 701 (decoder-name 702 (format-symbol t '#:~a-decoder (string name)))) 703 (labels ((make-bom-check-form (end start getter src) 704 (if (null endianness) 705 ``(when (not (zerop (- ,,end ,,start))) 706 (case (,,getter ,,src 0 ,',bytes :ne) 707 (#.+byte-order-mark-code+ 708 (incf ,,start ,',bytes) nil) 709 (#.+swapped-byte-order-mark-code-32+ 710 (incf ,,start ,',bytes) t) 711 (t #+little-endian t))) 712 '())) 713 (make-setter-form (setter code dest di) 714 ``(,,setter ,,code ,,dest ,,di ,',bytes 715 ,',(or endianness :ne))) 716 (make-getter-form (getter src i) 717 (if (null endianness) 718 ``(if ,',swap-var 719 (,,getter ,,src ,,i ,',bytes :re) 720 (,,getter ,,src ,,i ,',bytes :ne)) 721 ``(,,getter ,,src ,,i ,',bytes ,',endianness)))) 722 `(progn 723 (define-code-point-counter ,name (getter type) 724 `(named-lambda ,',code-point-counter-name (seq start end max) 725 (declare (type ,type seq) (fixnum start end max)) 726 ;; check for bom 727 ,,(make-bom-check-form ''end ''start 'getter ''seq) 728 (multiple-value-bind (count rem) 729 (floor (- end start) ,',bytes) 730 (cond 731 ((and (plusp max) (> count max)) 732 (values max (the fixnum (+ start (* ,',bytes max))))) 733 (t 734 ;; check for incomplete last character 735 (unless (zerop rem) 736 (let ((vector (make-array ,',bytes :fill-pointer 0))) 737 (dotimes (i rem) 738 (vector-push (,getter seq (+ i (- end rem))) vector)) 739 (decoding-error vector ,',name seq (the fixnum (- end rem)) nil 740 'end-of-input-in-character) 741 (decf end rem))) 742 (values count end)))))) 743 (define-encoder ,name (getter src-type setter dest-type) 744 `(named-lambda ,',encoder-name (src start end dest d-start) 745 (declare (type ,src-type src) 746 (type ,dest-type dest) 747 (fixnum start end d-start)) 748 (loop for i fixnum from start below end 749 and di fixnum from d-start by ,',bytes 750 for code of-type code-point = (,getter src i) 751 do (if (>= code ,',limit) 752 (encoding-error code ,',name src i +repl+) 753 ,,(make-setter-form 'setter ''code ''dest ''di)) 754 finally (return (the fixnum (- di d-start)))))) 755 (define-decoder ,name (getter src-type setter dest-type) 756 `(named-lambda ,',decoder-name (src start end dest d-start) 757 (declare (type ,src-type src) 758 (type ,dest-type dest) 759 (fixnum start end d-start)) 760 (let ((,',swap-var ,,(make-bom-check-form ''end ''start 'getter ''src))) 761 (declare (ignorable ,',swap-var)) 762 (loop for i fixnum from start below end by ,',bytes 763 and di from d-start 764 do (,setter (let ((unit ,,(make-getter-form 'getter ''src ''i))) 765 (if (>= unit ,',limit) 766 (decoding-error 767 (vector (,getter src i) 768 (,getter src (+ i 1)) 769 ,@,(if (= bytes 4) 770 ``((,getter src (+ i 2)) 771 (,getter src (+ i 3))))) 772 ,',name src i +repl+ 773 'character-out-of-range) 774 unit)) 775 dest di) 776 finally (return (the fixnum (- di d-start))))))) 777 ',name)))) 778 779 ;;;; UTF-32 780 781 (define-character-encoding :utf-32 782 "A 32-bit, fixed-length encoding in which all Unicode 783 characters can be encoded in a single 32-bit word. The 784 endianness of the encoded data is indicated by the endianness of 785 a byte-order-mark character (#\u+feff) prepended to the data; in 786 the absence of such a character on input, input data is assumed 787 to be in big-endian order. Output is written in native byte 788 order with a leading byte-order mark." 789 :aliases '(:ucs-4) 790 :max-units-per-char 1 791 :code-unit-size 32 792 :native-endianness t ; not necessarily true when decoding 793 :literal-char-code-limit #x110000 794 :use-bom #+little-endian :utf-32le #+big-endian :utf-32be 795 :bom-encoding 796 #+big-endian #(#x00 #x00 #xfe #xff) 797 #+little-endian #(#xff #xfe #x00 #x00) 798 :nul-encoding #(0 0 0 0) 799 :ambiguous #+little-endian t #+big-endian nil) 800 801 (define-ucs :utf-32 4) 802 803 (define-character-encoding :utf-32le 804 "A 32-bit, fixed-length encoding in which all Unicode 805 characters can be encoded in a single 32-bit word. Input data is assumed 806 to be in little-endian order. Output is also written in little-endian byte 807 order without a leading byte-order mark." 808 :max-units-per-char 1 809 :code-unit-size 32 810 :aliases '(:utf-32/le :ucs-4le :ucs-4/le) 811 :native-endianness #+little-endian t #+big-endian nil 812 :literal-char-code-limit #x110000 813 :nul-encoding #(0 0 0 0)) 814 815 (define-ucs :utf-32le 4 :le) 816 817 (define-character-encoding :utf-32be 818 "A 32-bit, fixed-length encoding in which all Unicode 819 characters can be encoded in a single 32-bit word. Input data is assumed 820 to be in big-endian order. Output is also written in big-endian byte 821 order without a leading byte-order mark." 822 :max-units-per-char 1 823 :code-unit-size 32 824 :aliases '(:utf-32/be :ucs-4be :ucs-4/be) 825 :native-endianness #+little-endian nil #+big-endian t 826 :literal-char-code-limit #x110000 827 :nul-encoding #(0 0 0 0)) 828 829 (define-ucs :utf-32be 4 :be) 830 831 ;; UCS-2 832 833 (define-character-encoding :ucs-2 834 "A 16-bit, fixed-length encoding in which all Unicode 835 characters can be encoded in a single 16-bit word. The 836 endianness of the encoded data is indicated by the endianness of 837 a byte-order-mark character (#\u+feff) prepended to the data; in 838 the absence of such a character on input, input data is assumed 839 to be in big-endian order. Output is written in native byte 840 order with a leading byte-order mark." 841 :aliases '(:ucs-2) 842 :max-units-per-char 1 843 :code-unit-size 16 844 :native-endianness t ; not necessarily true when decoding 845 :literal-char-code-limit #x10000 846 :use-bom #+little-endian :ucs-2le #+big-endian :ucs-2be 847 :bom-encoding 848 #+big-endian #(#xfe #xff) 849 #+little-endian #(#xff #xfe) 850 :nul-encoding #(0 0) 851 :ambiguous #+little-endian t #+big-endian nil) 852 853 (define-ucs :ucs-2 2 nil #x10000) 854 855 (define-character-encoding :ucs-2le 856 "A 16-bit, fixed-length encoding in which all Unicode 857 characters can be encoded in a single 16-bit word. Input data is assumed 858 to be in little-endian order. Output is also written in little-endian byte 859 order without a leading byte-order mark." 860 :max-units-per-char 1 861 :code-unit-size 16 862 :aliases '(:ucs-2/le) 863 :native-endianness #+little-endian t #+big-endian nil 864 :literal-char-code-limit #x10000 865 :nul-encoding #(0 0)) 866 867 (define-ucs :ucs-2le 2 :le #x10000) 868 869 (define-character-encoding :ucs-2be 870 "A 16-bit, fixed-length encoding in which all Unicode 871 characters can be encoded in a single 16-bit word. Input data is assumed 872 to be in big-endian order. Output is also written in big-endian byte 873 order without a leading byte-order mark." 874 :max-units-per-char 1 875 :code-unit-size 16 876 :aliases '(:ucs-2/be) 877 :native-endianness #+little-endian nil #+big-endian t 878 :literal-char-code-limit #x10000 879 :nul-encoding #(0 0)) 880 881 (define-ucs :ucs-2be 2 :be #x10000)