tencodings.lisp - clic - Clic is an command line interactive client for gopher written in Common LISP HTML git clone git://bitreich.org/clic/ git://hg6vgqziawt5s4dj.onion/clic/ DIR Log DIR Files DIR Refs DIR Tags DIR LICENSE --- tencodings.lisp (22651B) --- 1 ;;;; -*- Mode: lisp; indent-tabs-mode: nil -*- 2 ;;; 3 ;;; encodings.lisp --- Character encodings and mappings. 4 ;;; 5 ;;; Copyright (C) 2007, Luis Oliveira <loliveira@common-lisp.net> 6 ;;; 7 ;;; Permission is hereby granted, free of charge, to any person 8 ;;; obtaining a copy of this software and associated documentation 9 ;;; files (the "Software"), to deal in the Software without 10 ;;; restriction, including without limitation the rights to use, copy, 11 ;;; modify, merge, publish, distribute, sublicense, and/or sell copies 12 ;;; of the Software, and to permit persons to whom the Software is 13 ;;; furnished to do so, subject to the following conditions: 14 ;;; 15 ;;; The above copyright notice and this permission notice shall be 16 ;;; included in all copies or substantial portions of the Software. 17 ;;; 18 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 ;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 ;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 ;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 ;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 ;;; DEALINGS IN THE SOFTWARE. 26 27 (in-package #:babel-encodings) 28 29 ;;;; Character Encodings 30 31 (defclass character-encoding () 32 ((name :initarg :name :reader enc-name 33 :initform (error "Must specify a NAME for this character encoding.")) 34 ;; Most of these documentation strings are taken from OpenMCL. 35 (documentation 36 :initarg :documentation :reader enc-documentation :initform nil) 37 ;; A non-exhaustive list of aliases for the encoding. 38 (aliases :initarg :aliases :initform nil :reader enc-aliases) 39 ;; Specified in bits. Usually 8, 16 or 32. 40 (code-unit-size 41 :initarg :code-unit-size :reader enc-code-unit-size :initform 8) 42 (max-units-per-char 43 :initarg :max-units-per-char :reader enc-max-units-per-char :initform 1) 44 ;; If NIL, it is necessary to swap 16- and 32-bit units. 45 (native-endianness 46 :initarg :native-endianness :reader enc-native-endianness :initform t) 47 ;; Code units less than this value map to themselves on input. 48 (decode-literal-code-unit-limit 49 :initarg :decode-literal-code-unit-limit :initform 0 50 :reader enc-decode-literal-code-unit-limit) 51 ;; Code points less than this value map to themselves on output. 52 (encode-literal-code-unit-limit 53 :initarg :encode-literal-code-unit-limit :initform 0 54 :reader enc-encode-literal-code-unit-limit) 55 ;; Defines whether it is necessary to prepend a byte-order-mark to 56 ;; determine the endianness. 57 (use-bom :initarg :use-bom :initform nil :reader enc-use-bom) 58 ;; How the byte-order-mark should be encoded, specified as a 59 ;; sequence of octets. NIL if it cannot be encoded. 60 (bom-encoding 61 :initarg :bom-encoding :reader enc-bom-encoding :initform nil) 62 ;; How should NUL be encoded, specified as sequence of octets. 63 (nul-encoding 64 :initarg :nul-encoding :reader enc-nul-encoding :initform #(0)) 65 ;; Preferred replacement character code point. 66 (default-replacement 67 :initarg :default-replacement :reader enc-default-replacement 68 :initform #x1a) 69 ;; Does VALID-STRING => OCTETS => STRING2 guarantee a valid 70 ;; STRING2? UTF-{16,32} on little-endian plaforms don't because 71 ;; they assume different endianness on each direction. 72 (ambiguous 73 :initarg :ambiguous :reader ambiguous-encoding-p :initform nil))) 74 75 ;;; I'm too lazy to write all the identical limits twice. 76 (defmethod initialize-instance :after ((enc character-encoding) 77 &key literal-char-code-limit) 78 (when literal-char-code-limit 79 (setf (slot-value enc 'encode-literal-code-unit-limit) 80 literal-char-code-limit) 81 (setf (slot-value enc 'decode-literal-code-unit-limit) 82 literal-char-code-limit))) 83 84 #-(and) 85 (defmethod describe-object ((enc character-encoding) s) 86 "Prints out the name, aliases and documentation slots of a 87 character encoding object." 88 (with-slots (name aliases documentation) enc 89 (format s "~&~S" name) 90 (when aliases 91 (format s " [Aliases:~{ ~S~}]" aliases)) 92 (format s "~&~A~%~%" documentation)) 93 (call-next-method)) 94 95 (defvar *supported-character-encodings* nil) 96 97 (defun list-character-encodings () 98 "List of keyword symbols denoting supported character 99 encodings. This list does not include aliases." 100 *supported-character-encodings*) 101 102 (defvar *character-encodings* (make-hash-table :test 'eq)) 103 104 (defvar *default-character-encoding* :utf-8 105 "Special variable used to determine the default character 106 encoding.") 107 108 (defun get-character-encoding (name) 109 "Lookups the character encoding denoted by the keyword symbol 110 NAME. Signals an error if one is not found. If NAME is already 111 a CHARACTER-ENCONDING object, it is returned unmodified." 112 (when (typep name 'character-encoding) 113 (return-from get-character-encoding name)) 114 (when (eq name :default) 115 (setq name *default-character-encoding*)) 116 (or (gethash name *character-encodings*) 117 (error "Unknown character encoding: ~S" name))) 118 119 (defmethod ambiguous-encoding-p ((encoding symbol)) 120 (ambiguous-encoding-p (get-character-encoding encoding))) 121 122 (defun notice-character-encoding (enc) 123 (pushnew (enc-name enc) *supported-character-encodings*) 124 (dolist (kw (cons (enc-name enc) (enc-aliases enc))) 125 (setf (gethash kw *character-encodings*) enc)) 126 (enc-name enc)) 127 128 (defmacro define-character-encoding (name docstring &body options) 129 `(notice-character-encoding 130 (make-instance 'character-encoding :name ,name ,@options 131 :documentation ,docstring))) 132 133 ;;;; Mappings 134 135 ;;; TODO: describe what mappings are 136 137 (defun make-fixed-width-counter (getter type &optional (unit-size-in-bits 8)) 138 (declare (ignore getter type)) 139 (check-type unit-size-in-bits positive-fixnum) 140 (let ((unit-size-in-bytes (/ unit-size-in-bits 8))) 141 `(named-lambda fixed-width-counter (seq start end max) 142 (declare (ignore seq) (fixnum start end max)) 143 ;; XXX: the result can be bigger than a fixnum when (> unit-size 144 ;; 1) and we don't want that to happen. Possible solution: signal 145 ;; a warning (hmm, make that an actual error) and truncate. 146 (if (plusp max) 147 (let ((count (the fixnum (min (floor max ,unit-size-in-bytes) 148 (the fixnum (- end start)))))) 149 (values (the fixnum (* count ,unit-size-in-bytes)) 150 (the fixnum (+ start count)))) 151 (values (the fixnum (* (the fixnum (- end start)) 152 ,unit-size-in-bytes)) 153 (the fixnum end)))))) 154 155 ;;; Useful to develop new encodings incrementally starting with octet 156 ;;; and code-unit counters. 157 (defun make-dummy-coder (sg st ds dt) 158 (declare (ignore sg st ds dt)) 159 `(named-lambda dummy-coder (src s e dest i) 160 (declare (ignore src s e dest i)) 161 (error "this encoder/decoder hasn't been implemented yet"))) 162 163 ;;; TODO: document here 164 ;;; 165 ;;; ENCODER -- (lambda (src-getter src-type dest-setter dest-type) ...) 166 ;;; DECODER -- (lambda (src-getter src-type dest-setter dest-type) ...) 167 ;;; 168 ;;; OCTET-COUNTER -- (lambda (getter type) ...) 169 ;;; CODE-POINT-COUNTER -- (lambda (getter type) ...) 170 (defclass abstract-mapping () 171 ((encoder-factory :accessor encoder-factory :initform 'make-dummy-coder) 172 (decoder-factory :accessor decoder-factory :initform 'make-dummy-coder) 173 (octet-counter-factory :accessor octet-counter-factory 174 :initform 'make-fixed-width-counter) 175 (code-point-counter-factory :accessor code-point-counter-factory 176 :initform 'make-fixed-width-counter))) 177 178 ;;; TODO: document these 179 ;;; 180 ;;; ENCODER -- (lambda (src start end dest d-start) ...) 181 ;;; DECODER -- (lambda (src start end dest d-start) ...) 182 ;;; 183 ;;; OCTET-COUNTER -- (lambda (seq start end max-octets) ...) 184 ;;; CODE-POINT-COUNTER -- (lambda (seq start end max-chars) ...) 185 ;;; => N-CHARS NEW-END 186 ;;; (important: describe NEW-END) 187 (defclass concrete-mapping () 188 ((encoder :accessor encoder) 189 (decoder :accessor decoder) 190 (octet-counter :accessor octet-counter) 191 (code-point-counter :accessor code-point-counter))) 192 193 (defparameter *abstract-mappings* (make-hash-table :test 'eq)) 194 195 (defun get-abstract-mapping (encoding) 196 (gethash encoding *abstract-mappings*)) 197 198 (defun (setf get-abstract-mapping) (value encoding) 199 (setf (gethash encoding *abstract-mappings*) value)) 200 201 (defun %register-mapping-part (encoding slot-name fn) 202 (let ((mapping (get-abstract-mapping encoding))) 203 (unless mapping 204 (setq mapping (make-instance 'abstract-mapping)) 205 (setf (get-abstract-mapping encoding) mapping)) 206 (setf (slot-value mapping slot-name) fn))) 207 208 ;;; See enc-*.lisp for example usages of these 4 macros. 209 210 (defmacro define-encoder (encoding (sa st da dt) &body body) 211 `(%register-mapping-part ,encoding 'encoder-factory 212 (named-lambda encoder (,sa ,st ,da ,dt) 213 ,@body))) 214 215 (defmacro define-decoder (encoding (sa st da dt) &body body) 216 `(%register-mapping-part ,encoding 'decoder-factory 217 (named-lambda decoder (,sa ,st ,da ,dt) 218 ,@body))) 219 220 (defmacro define-octet-counter (encoding (acc type) &body body) 221 `(%register-mapping-part ,encoding 'octet-counter-factory 222 (named-lambda octet-counter-factory (,acc ,type) 223 ,@body))) 224 225 (defmacro define-code-point-counter (encoding (acc type) &body body) 226 `(%register-mapping-part ,encoding 'code-point-counter-factory 227 (named-lambda code-point-counter (,acc ,type) 228 ,@body))) 229 230 (defun instantiate-encoder (encoding am octet-seq-getter octet-seq-type 231 code-point-seq-setter code-point-seq-type) 232 (declare (ignore encoding)) 233 (funcall (encoder-factory am) 234 octet-seq-getter 235 octet-seq-type 236 code-point-seq-setter 237 code-point-seq-type)) 238 239 (defun instantiate-decoder (encoding am octet-seq-getter octet-seq-type 240 code-point-seq-setter code-point-seq-type) 241 (declare (ignore encoding)) 242 (funcall (decoder-factory am) 243 octet-seq-getter 244 octet-seq-type 245 code-point-seq-setter 246 code-point-seq-type)) 247 248 (defun instantiate-code-point-counter (encoding am octet-seq-getter 249 octet-seq-type) 250 (declare (ignore encoding)) 251 (funcall (code-point-counter-factory am) 252 octet-seq-getter 253 octet-seq-type)) 254 255 (defun instantiate-octet-counter (encoding am code-point-seq-getter 256 code-point-seq-type) 257 (if (= 1 (enc-max-units-per-char encoding)) 258 (make-fixed-width-counter code-point-seq-getter code-point-seq-type 259 (enc-code-unit-size encoding)) 260 (funcall (octet-counter-factory am) 261 code-point-seq-getter 262 code-point-seq-type))) 263 264 ;;; Expands into code generated by the available abstract mappings 265 ;;; that will be compiled into concrete mappings. This is used in 266 ;;; e.g. strings.lisp to define mappings between strings and 267 ;;; (unsigned-byte 8) vectors. 268 ;;; 269 ;;; For each encoding funcall the abstract mappings at macro-expansion 270 ;;; time with the src/dest accessors and types to generate the 271 ;;; appropriate code for the concrete mappings. These functions are 272 ;;; then saved in their respective slots of the CONCRETE-MAPPING 273 ;;; object. 274 (defmacro instantiate-concrete-mappings 275 (&key (encodings (hash-table-keys *abstract-mappings*)) 276 (optimize '((speed 3) (debug 0) (compilation-speed 0))) 277 octet-seq-getter octet-seq-setter octet-seq-type 278 code-point-seq-getter code-point-seq-setter code-point-seq-type 279 (instantiate-decoders t)) 280 `(let ((ht (make-hash-table :test 'eq))) 281 (declare (optimize ,@optimize) 282 #+sbcl (sb-ext:muffle-conditions sb-ext:compiler-note)) 283 (flet ((notice-mapping (encoding-name cm) 284 (let* ((encoding (get-character-encoding encoding-name)) 285 (aliases (enc-aliases encoding))) 286 (dolist (kw (cons (enc-name encoding) aliases)) 287 (setf (gethash kw ht) cm))))) 288 ,@(loop for encoding-name in encodings 289 for encoding = (get-character-encoding encoding-name) 290 for am = (gethash encoding-name *abstract-mappings*) 291 collect 292 `(let ((cm (make-instance 'concrete-mapping))) 293 (setf (encoder cm) 294 ,(instantiate-encoder encoding am 295 code-point-seq-getter 296 code-point-seq-type 297 octet-seq-setter 298 octet-seq-type)) 299 ,(when instantiate-decoders 300 `(progn 301 (setf (decoder cm) 302 ,(instantiate-decoder encoding am 303 octet-seq-getter 304 octet-seq-type 305 code-point-seq-setter 306 code-point-seq-type)) 307 (setf (code-point-counter cm) 308 ,(instantiate-code-point-counter 309 encoding am octet-seq-getter octet-seq-type)))) 310 (setf (octet-counter cm) 311 ,(instantiate-octet-counter encoding am 312 code-point-seq-getter 313 code-point-seq-type)) 314 (notice-mapping ,encoding-name cm)))) 315 ht)) 316 317 ;;; debugging stuff 318 319 #-(and) 320 (defun pprint-instantiate-concrete-mappings 321 (&key (encodings (hash-table-keys *abstract-mappings*)) 322 (optimize '((debug 3) (safety 3))) 323 (octet-seq-setter 'ub-set) (octet-seq-getter 'ub-get) 324 (octet-seq-type '(simple-array (unsigned-byte 8) (*))) 325 (code-point-seq-setter 'string-set) 326 (code-point-seq-getter 'string-get) 327 (code-point-seq-type 'simple-unicode-string)) 328 (let ((encodings (ensure-list encodings)) 329 (*package* (find-package :babel-encodings)) 330 (*print-case* :downcase)) 331 (pprint 332 (macroexpand 333 `(instantiate-concrete-mappings 334 :encodings ,encodings 335 :optimize ,optimize 336 :octet-seq-getter ,octet-seq-getter 337 :octet-seq-setter ,octet-seq-setter 338 :octet-seq-type ,octet-seq-type 339 :code-point-seq-getter ,code-point-seq-getter 340 :code-point-seq-setter ,code-point-seq-setter 341 :code-point-seq-type ,code-point-seq-type)))) 342 (values)) 343 344 ;;;; Utilities used in enc-*.lisp 345 346 (defconstant +default-substitution-code-point+ #x1a 347 "Default ASCII substitution character code point used in case of an encoding/decoding error.") 348 349 ;;; We're converting between objects of the (UNSIGNED-BYTE 8) and 350 ;;; (MOD #x110000) types which are aliased here to UB8 and CODE-POINT 351 ;;; for convenience. 352 (deftype ub8 () '(unsigned-byte 8)) 353 (deftype code-point () '(mod #x110000)) 354 355 ;;; Utility macro around DEFINE-ENCODER that takes care of most of the 356 ;;; work need to deal with an 8-bit, fixed-width character encoding. 357 ;;; 358 ;;; BODY will be inside a loop and its return value will placed in the 359 ;;; destination buffer. BODY will be surounded by lexical BLOCK which 360 ;;; will have the ENCODING's name, usually a keyword. It handles all 361 ;;; sorts of type declarations. 362 ;;; 363 ;;; See enc-ascii.lisp for a simple usage example. 364 (defmacro define-unibyte-encoder (encoding (code) &body body) 365 (with-unique-names (s-getter s-type d-setter d-type 366 src start end dest d-start i di) 367 `(define-encoder ,encoding (,s-getter ,s-type ,d-setter ,d-type) 368 `(named-lambda ,',(symbolicate encoding '#:-unibyte-encoder) 369 (,',src ,',start ,',end ,',dest ,',d-start) 370 (declare (type ,,s-type ,',src) 371 (type ,,d-type ,',dest) 372 (fixnum ,',start ,',end ,',d-start)) 373 (loop for ,',i fixnum from ,',start below ,',end 374 and ,',di fixnum from ,',d-start do 375 (,,d-setter 376 (macrolet 377 ;; this should probably be a function... 378 ((handle-error (&optional (c ''character-encoding-error)) 379 `(encoding-error 380 ,',',code ,',',encoding ,',',src ,',',i 381 +default-substitution-code-point+ ,c))) 382 (let ((,',code (,,s-getter ,',src ,',i))) 383 (declare (type code-point ,',code)) 384 (block ,',encoding ,@',body))) 385 ,',dest ,',di) 386 finally (return (the fixnum (- ,',di ,',d-start)))))))) 387 388 ;;; The decoder version of the above macro. 389 (defmacro define-unibyte-decoder (encoding (octet) &body body) 390 (with-unique-names (s-getter s-type d-setter d-type 391 src start end dest d-start i di) 392 `(define-decoder ,encoding (,s-getter ,s-type ,d-setter ,d-type) 393 `(named-lambda ,',(symbolicate encoding '#:-unibyte-encoder) 394 (,',src ,',start ,',end ,',dest ,',d-start) 395 (declare (type ,,s-type ,',src) 396 (type ,,d-type ,',dest) 397 (fixnum ,',start ,',end ,',d-start)) 398 (loop for ,',i fixnum from ,',start below ,',end 399 and ,',di fixnum from ,',d-start do 400 (,,d-setter 401 (macrolet 402 ;; this should probably be a function... 403 ((handle-error (&optional (c ''character-decoding-error)) 404 `(decoding-error 405 (vector ,',',octet) ,',',encoding ,',',src ,',',i 406 +default-substitution-code-point+ ,c))) 407 (let ((,',octet (,,s-getter ,',src ,',i))) 408 (declare (type ub8 ,',octet)) 409 (block ,',encoding ,@',body))) 410 ,',dest ,',di) 411 finally (return (the fixnum (- ,',di ,',d-start)))))))) 412 413 ;;;; Error Conditions 414 ;;; 415 ;;; For now, we don't define any actual restarts. The only mechanism 416 ;;; for "restarting" a coding error is the 417 ;;; *SUPPRESS-CHARACTER-CODING-ERRORS* special variable which, when 418 ;;; bound to T (the default), suppresses any error and uses a default 419 ;;; replacement character instead. 420 ;;; 421 ;;; If it turns out that other more options are necessary, possible 422 ;;; alternative approaches include: 423 ;;; 424 ;;; a) use a *REPLACEMENT-CHARACTER* special variable that lets us 425 ;;; pick our own replacement character. The encoder must do 426 ;;; additional work to check if this is character is encodable. 427 ;;; 428 ;;; b) offer a restart to pick a replacement character. Same 429 ;;; problem as above. 430 ;;; 431 ;;; Both approaches pose encoding problems when dealing with a 432 ;;; variable-width encodings because different replacement characters 433 ;;; will need different numbers of octets. This is not a problem for 434 ;;; UTF but will be a problem for the CJK charsets. Approach (a) is 435 ;;; nevertheless easier since the replacement character is known in 436 ;;; advance and therefore the octet-counter can account for it. 437 ;;; 438 ;;; For more complex restarts like SBCL's -- that'll let you specify 439 ;;; _several_ replacement characters for a single character error -- 440 ;;; will probably need extra support code outside the encoder/decoder 441 ;;; (i.e. in the string-to-octets function, for example) since the 442 ;;; encoders/decoders deal with pre-allocated fixed-length buffers. 443 ;;; 444 ;;; SBCL has ASCII-specific (MALFORMED-ASCII) and UTF8-specific 445 ;;; errors. Why? Do we want to add some of those too? 446 447 ;;; FIXME: We used to deal with this with an extra ERRORP argument for 448 ;;; encoders, decoders, etc... Still undecided on the best way to do 449 ;;; it. We could also use a simple restart instead of this... 450 ;;; 451 ;;; In any case, this is not for the users to bind and it's not 452 ;;; exported from the BABEL package. 453 (defvar *suppress-character-coding-errors* nil 454 "If non-NIL, encoding or decoding errors are suppressed and the 455 the current character encoding's default replacement character is 456 used.") 457 458 ;;; All of Babel's error conditions are subtypes of 459 ;;; CHARACTER-CODING-ERROR. This error hierarchy is based on SBCL's. 460 (define-condition character-coding-error (error) 461 ((buffer :initarg :buffer :reader character-coding-error-buffer) 462 (position :initarg :position :reader character-coding-error-position) 463 (encoding :initarg :encoding :reader character-coding-error-encoding))) 464 465 (define-condition character-encoding-error (character-coding-error) 466 ((code :initarg :code :reader character-encoding-error-code)) 467 (:report (lambda (c s) 468 (format s "Unable to encode character code point ~A as ~S." 469 (character-encoding-error-code c) 470 (character-coding-error-encoding c))))) 471 472 (declaim (inline encoding-error)) 473 (defun encoding-error (code enc buf pos &optional 474 (sub +default-substitution-code-point+) 475 (e 'character-encoding-error)) 476 (unless *suppress-character-coding-errors* 477 (error e :encoding enc :buffer buf :position pos :code code)) 478 sub) 479 480 (define-condition character-decoding-error (character-coding-error) 481 ((octets :initarg :octets :reader character-decoding-error-octets)) 482 (:report (lambda (c s) 483 (format s "Illegal ~S character starting at position ~D." 484 (character-coding-error-encoding c) 485 (character-coding-error-position c))))) 486 487 (define-condition end-of-input-in-character (character-decoding-error) 488 () 489 (:documentation "Signalled by DECODERs or CODE-POINT-COUNTERs 490 of variable-width character encodings.")) 491 492 (define-condition character-out-of-range (character-decoding-error) 493 () 494 (:documentation 495 "Signalled when the character being decoded is out of range.")) 496 497 (declaim (inline decoding-error)) 498 (defun decoding-error (octets enc buf pos &optional 499 (sub +default-substitution-code-point+) 500 (e 'character-decoding-error)) 501 (unless *suppress-character-coding-errors* 502 (error e :octets octets :encoding enc :buffer buf :position pos)) 503 sub)