tstrings.lisp - clic - Clic is an command line interactive client for gopher written in Common LISP HTML git clone git://bitreich.org/clic/ git://hg6vgqziawt5s4dj.onion/clic/ DIR Log DIR Files DIR Refs DIR Tags DIR LICENSE --- tstrings.lisp (15562B) --- 1 ;;;; -*- Mode: lisp; indent-tabs-mode: nil -*- 2 ;;; 3 ;;; strings.lisp --- Conversions between strings and UB8 vectors. 4 ;;; 5 ;;; Copyright (C) 2007, Luis Oliveira <loliveira@common-lisp.net> 6 ;;; 7 ;;; Permission is hereby granted, free of charge, to any person 8 ;;; obtaining a copy of this software and associated documentation 9 ;;; files (the "Software"), to deal in the Software without 10 ;;; restriction, including without limitation the rights to use, copy, 11 ;;; modify, merge, publish, distribute, sublicense, and/or sell copies 12 ;;; of the Software, and to permit persons to whom the Software is 13 ;;; furnished to do so, subject to the following conditions: 14 ;;; 15 ;;; The above copyright notice and this permission notice shall be 16 ;;; included in all copies or substantial portions of the Software. 17 ;;; 18 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 ;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 ;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 ;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 ;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 ;;; DEALINGS IN THE SOFTWARE. 26 27 (in-package #:babel) 28 29 ;;; The usefulness of this string/octets interface of Babel's is very 30 ;;; limited on Lisps with 8-bit characters which will in effect only 31 ;;; support the latin-1 subset of Unicode. That is, all encodings are 32 ;;; supported but we can only store the first 256 code points in Lisp 33 ;;; strings. Support for using other 8-bit encodings for strings on 34 ;;; these Lisps could be added with an extra encoding/decoding step. 35 ;;; Supporting other encodings with larger code units would be silly 36 ;;; (it would break expectations about common string operations) and 37 ;;; better done with something like Closure's runes. 38 39 ;;; Can we handle unicode fully? 40 (eval-when (:compile-toplevel :load-toplevel :execute) 41 ;; The EVAL is just here to avoid warnings... 42 (case (eval char-code-limit) 43 (#x100 (pushnew '8-bit-chars *features*)) 44 (#x10000 (pushnew 'ucs-2-chars *features*)) 45 (#x110000 #| yay |#) 46 ;; This is here mostly because if the CHAR-CODE-LIMIT is bigger 47 ;; than #x11000, strange things might happen but we probably 48 ;; shouldn't descriminate against other, smaller, values. 49 (t (error "Strange CHAR-CODE-LIMIT (#x~X), bailing out." 50 char-code-limit)))) 51 52 ;;; Adapted from Ironclad. TODO: check if it's worthwhile adding 53 ;;; implementation-specific accessors such as SAP-REF-* for SBCL. 54 (defmacro ub-get (vector index &optional (bytes 1) (endianness :ne)) 55 (let ((big-endian (member endianness 56 '(:be #+big-endian :ne #+little-endian :re)))) 57 (once-only (vector index) 58 `(logand 59 ,(1- (ash 1 (* 8 bytes))) 60 (logior 61 ,@(loop for i from 0 below bytes 62 for offset = (if big-endian i (- bytes i 1)) 63 for shift = (if big-endian 64 (* (- bytes i 1) 8) 65 (* offset 8)) 66 collect `(ash (aref ,vector (+ ,index ,offset)) ,shift))))))) 67 68 (defmacro ub-set (value vector index &optional (bytes 1) (endianness :ne)) 69 (let ((big-endian (member endianness 70 '(:be #+big-endian :ne #+little-endian :re)))) 71 `(progn 72 ,@(loop for i from 1 to bytes 73 for offset = (if big-endian (- bytes i) (1- i)) collect 74 `(setf (aref ,vector (+ ,index ,offset)) 75 (ldb (byte 8 ,(* 8 (1- i))) ,value))) 76 (values)))) 77 78 (defmacro string-get (string index) 79 `(char-code (schar ,string ,index))) 80 81 (defmacro string-set (code string index) 82 `(setf (schar ,string ,index) (code-char ,code))) 83 84 ;;; SIMPLE-BASE-STRING would also be a subtype of SIMPLE-STRING so we 85 ;;; don't use that because on SBCL BASE-CHARs can only hold ASCII. 86 ;;; Also, with (> SPEED SAFETY) (setf (schar base-str n) big-char) 87 ;;; will quietly work, sort of. 88 ;;; 89 ;;; XXX: test this on various lisps. 90 91 (defconstant unicode-char-code-limit 92 char-code-limit 93 "An alias for CL:CHAR-CODE-LIMIT which might be lower than 94 #x110000 on some Lisps.") 95 96 (deftype unicode-char () 97 "This character type can hold any characters whose CHAR-CODEs 98 are less than UNICODE-CHAR-CODE-LIMIT." 99 #+lispworks 'lw:simple-char 100 #-lispworks 'character) 101 102 (deftype simple-unicode-string () 103 "Alias for (SIMPLE-ARRAY UNICODE-CHAR (*))." 104 '(simple-array unicode-char (*))) 105 106 (deftype unicode-string () 107 "Alias for (VECTOR UNICODE-CHAR *)." 108 '(vector unicode-char *)) 109 110 (defparameter *string-vector-mappings* 111 (instantiate-concrete-mappings 112 ;; :optimize ((speed 3) (safety 0) (debug 0) (compilation-speed 0)) 113 :octet-seq-setter ub-set 114 :octet-seq-getter ub-get 115 :octet-seq-type (simple-array (unsigned-byte 8) (*)) 116 :code-point-seq-setter string-set 117 :code-point-seq-getter string-get 118 :code-point-seq-type simple-unicode-string)) 119 120 #+sbcl 121 (defparameter *simple-base-string-vector-mappings* 122 (instantiate-concrete-mappings 123 ;; :optimize ((speed 3) (safety 0) (debug 0) (compilation-speed 0)) 124 :instantiate-decoders nil 125 :octet-seq-setter ub-set 126 :octet-seq-getter ub-get 127 :octet-seq-type (simple-array (unsigned-byte 8) (*)) 128 :code-point-seq-setter string-set 129 :code-point-seq-getter string-get 130 :code-point-seq-type simple-base-string)) 131 132 ;;; Do we want a more a specific error condition here? 133 (defun check-vector-bounds (vector start end) 134 (unless (<= 0 start end (length vector)) 135 (error "Invalid start (~A) and end (~A) values for vector of length ~A." 136 start end (length vector)))) 137 138 (defmacro with-simple-vector (((v vector) (s start) (e end)) &body body) 139 "If VECTOR is a displaced or adjustable array, binds V to the 140 underlying simple vector, adds an adequate offset to START and 141 END and binds those offset values to S and E. Otherwise, if 142 VECTOR is already a simple array, it's simply bound to V with no 143 further changes. 144 145 START and END are unchecked and assumed to be within bounds. 146 147 Note that in some Lisps, a slow copying implementation is 148 necessary to obtain a simple vector thus V will be bound to a 149 copy of VECTOR coerced to a simple-vector. Therefore, you 150 shouldn't attempt to modify V." 151 #+sbcl 152 `(sb-kernel:with-array-data ((,v ,vector) (,s ,start) (,e ,end)) 153 ,@body) 154 #+(or cmu scl) 155 `(lisp::with-array-data ((,v ,vector) (,s ,start) (,e ,end)) 156 ,@body) 157 #+openmcl 158 (with-unique-names (offset) 159 `(multiple-value-bind (,v ,offset) 160 (ccl::array-data-and-offset ,vector) 161 (let ((,s (+ ,start ,offset)) 162 (,e (+ ,end ,offset))) 163 ,@body))) 164 #+allegro 165 (with-unique-names (offset) 166 `(excl::with-underlying-simple-vector (,vector ,v ,offset) 167 (let ((,e (+ ,end ,offset)) 168 (,s (+ ,start ,offset))) 169 ,@body))) 170 ;; slow, copying implementation 171 #-(or sbcl cmu scl openmcl allegro) 172 (once-only (vector) 173 `(funcall (if (adjustable-array-p ,vector) 174 #'call-with-array-data/copy 175 #'call-with-array-data/fast) 176 ,vector ,start ,end 177 (lambda (,v ,s ,e) ,@body)))) 178 179 #-(or sbcl cmu scl openmcl allegro) 180 (progn 181 ;; Stolen from f2cl. 182 (defun array-data-and-offset (array) 183 (loop with offset = 0 do 184 (multiple-value-bind (displaced-to index-offset) 185 (array-displacement array) 186 (when (null displaced-to) 187 (return-from array-data-and-offset 188 (values array offset))) 189 (incf offset index-offset) 190 (setf array displaced-to)))) 191 192 (defun call-with-array-data/fast (vector start end fn) 193 (multiple-value-bind (data offset) 194 (array-data-and-offset vector) 195 (funcall fn data (+ offset start) (+ offset end)))) 196 197 (defun call-with-array-data/copy (vector start end fn) 198 (funcall fn (replace (make-array (- end start) :element-type 199 (array-element-type vector)) 200 vector :start2 start :end2 end) 201 0 (- end start)))) 202 203 (defmacro with-checked-simple-vector (((v vector) (s start) (e end)) &body body) 204 "Like WITH-SIMPLE-VECTOR but bound-checks START and END." 205 (once-only (vector start) 206 `(let ((,e (or ,end (length ,vector)))) 207 (check-vector-bounds ,vector ,start ,e) 208 (with-simple-vector ((,v ,vector) (,s ,start) (,e ,e)) 209 ,@body)))) 210 211 ;;; Future features these functions should have: 212 ;;; 213 ;;; * null-terminate 214 ;;; * specify target vector/string + offset 215 ;;; * documentation :) 216 217 (declaim (inline octets-to-string string-to-octets string-size-in-octets 218 vector-size-in-chars concatenate-strings-to-octets 219 bom-vector)) 220 221 (defun octets-to-string (vector &key (start 0) end 222 (errorp (not *suppress-character-coding-errors*)) 223 (encoding *default-character-encoding*)) 224 (check-type vector (vector (unsigned-byte 8))) 225 (with-checked-simple-vector ((vector vector) (start start) (end end)) 226 (declare (type (simple-array (unsigned-byte 8) (*)) vector)) 227 (let ((*suppress-character-coding-errors* (not errorp)) 228 (mapping (lookup-mapping *string-vector-mappings* encoding))) 229 (multiple-value-bind (size new-end) 230 (funcall (code-point-counter mapping) vector start end -1) 231 ;; TODO we could optimize ASCII here: the result should 232 ;; be a simple-base-string filled using code-char... 233 (let ((string (make-string size :element-type 'unicode-char))) 234 (funcall (decoder mapping) vector start new-end string 0) 235 string))))) 236 237 (defun bom-vector (encoding use-bom) 238 (check-type use-bom (member :default t nil)) 239 (the simple-vector 240 (if (null use-bom) 241 #() 242 (let ((enc (typecase encoding 243 (external-format (external-format-encoding encoding)) 244 (t (get-character-encoding encoding))))) 245 (if (or (eq use-bom t) 246 (and (eq use-bom :default) (enc-use-bom enc))) 247 ;; VALUES avoids a "type assertion too complex to check" note. 248 (values (enc-bom-encoding enc)) 249 #()))))) 250 251 (defun string-to-octets (string &key (encoding *default-character-encoding*) 252 (start 0) end (use-bom :default) 253 (errorp (not *suppress-character-coding-errors*))) 254 (declare (optimize (speed 3) (safety 2))) 255 (let ((*suppress-character-coding-errors* (not errorp))) 256 (etypecase string 257 ;; On some lisps (e.g. clisp and ccl) all strings are BASE-STRING and all 258 ;; characters are BASE-CHAR. So, only enable this optimization for 259 ;; selected targets. 260 #+sbcl 261 (simple-base-string 262 (unless end 263 (setf end (length string))) 264 (check-vector-bounds string start end) 265 (let* ((mapping (lookup-mapping *simple-base-string-vector-mappings* 266 encoding)) 267 (bom (bom-vector encoding use-bom)) 268 (bom-length (length bom)) 269 ;; OPTIMIZE: we could use the (length string) information here 270 ;; because it's a simple-base-string where each character <= 127 271 (result (make-array 272 (+ (the array-index 273 (funcall (the function (octet-counter mapping)) 274 string start end -1)) 275 bom-length) 276 :element-type '(unsigned-byte 8)))) 277 (replace result bom) 278 (funcall (the function (encoder mapping)) 279 string start end result bom-length) 280 result)) 281 (string 282 ;; FIXME: we shouldn't really need that coercion to UNICODE-STRING 283 ;; but we kind of because it's declared all over. To avoid that, 284 ;; we'd need different types for input and output strings. Or maybe 285 ;; this is not a problem; figure that out. 286 (with-checked-simple-vector ((string (coerce string 'unicode-string)) 287 (start start) (end end)) 288 (declare (type simple-unicode-string string)) 289 (let* ((mapping (lookup-mapping *string-vector-mappings* encoding)) 290 (bom (bom-vector encoding use-bom)) 291 (bom-length (length bom)) 292 (result (make-array 293 (+ (the array-index 294 (funcall (the function (octet-counter mapping)) 295 string start end -1)) 296 bom-length) 297 :element-type '(unsigned-byte 8)))) 298 (replace result bom) 299 (funcall (the function (encoder mapping)) 300 string start end result bom-length) 301 result)))))) 302 303 (defun concatenate-strings-to-octets (encoding &rest strings) 304 "Optimized equivalent of 305 \(string-to-octets \(apply #'concatenate 'string strings) 306 :encoding encoding)" 307 (declare (dynamic-extent strings)) 308 (let* ((mapping (lookup-mapping *string-vector-mappings* encoding)) 309 (octet-counter (octet-counter mapping)) 310 (vector (make-array 311 (the array-index 312 (reduce #'+ strings 313 :key (lambda (string) 314 (funcall octet-counter 315 string 0 (length string) -1)))) 316 :element-type '(unsigned-byte 8))) 317 (current-index 0)) 318 (declare (type array-index current-index)) 319 (dolist (string strings) 320 (check-type string string) 321 (with-checked-simple-vector ((string (coerce string 'unicode-string)) 322 (start 0) (end (length string))) 323 (declare (type simple-unicode-string string)) 324 (incf current-index 325 (funcall (encoder mapping) 326 string start end vector current-index)))) 327 vector)) 328 329 (defun string-size-in-octets (string &key (start 0) end (max -1 maxp) 330 (errorp (not *suppress-character-coding-errors*)) 331 (encoding *default-character-encoding*)) 332 (check-type string string) 333 (with-checked-simple-vector ((string (coerce string 'unicode-string)) 334 (start start) (end end)) 335 (declare (type simple-unicode-string string)) 336 (let ((mapping (lookup-mapping *string-vector-mappings* encoding)) 337 (*suppress-character-coding-errors* (not errorp))) 338 (when maxp (assert (plusp max))) 339 (funcall (octet-counter mapping) string start end max)))) 340 341 (defun vector-size-in-chars (vector &key (start 0) end (max -1 maxp) 342 (errorp (not *suppress-character-coding-errors*)) 343 (encoding *default-character-encoding*)) 344 (check-type vector (vector (unsigned-byte 8))) 345 (with-checked-simple-vector ((vector vector) (start start) (end end)) 346 (declare (type (simple-array (unsigned-byte 8) (*)) vector)) 347 (let ((mapping (lookup-mapping *string-vector-mappings* encoding)) 348 (*suppress-character-coding-errors* (not errorp))) 349 (when maxp (assert (plusp max))) 350 (funcall (code-point-counter mapping) vector start end max)))) 351 352 (declaim (notinline octets-to-string string-to-octets string-size-in-octets 353 vector-size-in-chars concatenate-strings-to-octets))