tstrings.lisp - clic - Clic is an command line interactive client for gopher written in Common LISP
HTML git clone git://bitreich.org/clic/ git://hg6vgqziawt5s4dj.onion/clic/
DIR Log
DIR Files
DIR Refs
DIR Tags
DIR LICENSE
---
tstrings.lisp (15562B)
---
1 ;;;; -*- Mode: lisp; indent-tabs-mode: nil -*-
2 ;;;
3 ;;; strings.lisp --- Conversions between strings and UB8 vectors.
4 ;;;
5 ;;; Copyright (C) 2007, Luis Oliveira <loliveira@common-lisp.net>
6 ;;;
7 ;;; Permission is hereby granted, free of charge, to any person
8 ;;; obtaining a copy of this software and associated documentation
9 ;;; files (the "Software"), to deal in the Software without
10 ;;; restriction, including without limitation the rights to use, copy,
11 ;;; modify, merge, publish, distribute, sublicense, and/or sell copies
12 ;;; of the Software, and to permit persons to whom the Software is
13 ;;; furnished to do so, subject to the following conditions:
14 ;;;
15 ;;; The above copyright notice and this permission notice shall be
16 ;;; included in all copies or substantial portions of the Software.
17 ;;;
18 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 ;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22 ;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23 ;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 ;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 ;;; DEALINGS IN THE SOFTWARE.
26
27 (in-package #:babel)
28
29 ;;; The usefulness of this string/octets interface of Babel's is very
30 ;;; limited on Lisps with 8-bit characters which will in effect only
31 ;;; support the latin-1 subset of Unicode. That is, all encodings are
32 ;;; supported but we can only store the first 256 code points in Lisp
33 ;;; strings. Support for using other 8-bit encodings for strings on
34 ;;; these Lisps could be added with an extra encoding/decoding step.
35 ;;; Supporting other encodings with larger code units would be silly
36 ;;; (it would break expectations about common string operations) and
37 ;;; better done with something like Closure's runes.
38
39 ;;; Can we handle unicode fully?
40 (eval-when (:compile-toplevel :load-toplevel :execute)
41 ;; The EVAL is just here to avoid warnings...
42 (case (eval char-code-limit)
43 (#x100 (pushnew '8-bit-chars *features*))
44 (#x10000 (pushnew 'ucs-2-chars *features*))
45 (#x110000 #| yay |#)
46 ;; This is here mostly because if the CHAR-CODE-LIMIT is bigger
47 ;; than #x11000, strange things might happen but we probably
48 ;; shouldn't descriminate against other, smaller, values.
49 (t (error "Strange CHAR-CODE-LIMIT (#x~X), bailing out."
50 char-code-limit))))
51
52 ;;; Adapted from Ironclad. TODO: check if it's worthwhile adding
53 ;;; implementation-specific accessors such as SAP-REF-* for SBCL.
54 (defmacro ub-get (vector index &optional (bytes 1) (endianness :ne))
55 (let ((big-endian (member endianness
56 '(:be #+big-endian :ne #+little-endian :re))))
57 (once-only (vector index)
58 `(logand
59 ,(1- (ash 1 (* 8 bytes)))
60 (logior
61 ,@(loop for i from 0 below bytes
62 for offset = (if big-endian i (- bytes i 1))
63 for shift = (if big-endian
64 (* (- bytes i 1) 8)
65 (* offset 8))
66 collect `(ash (aref ,vector (+ ,index ,offset)) ,shift)))))))
67
68 (defmacro ub-set (value vector index &optional (bytes 1) (endianness :ne))
69 (let ((big-endian (member endianness
70 '(:be #+big-endian :ne #+little-endian :re))))
71 `(progn
72 ,@(loop for i from 1 to bytes
73 for offset = (if big-endian (- bytes i) (1- i)) collect
74 `(setf (aref ,vector (+ ,index ,offset))
75 (ldb (byte 8 ,(* 8 (1- i))) ,value)))
76 (values))))
77
78 (defmacro string-get (string index)
79 `(char-code (schar ,string ,index)))
80
81 (defmacro string-set (code string index)
82 `(setf (schar ,string ,index) (code-char ,code)))
83
84 ;;; SIMPLE-BASE-STRING would also be a subtype of SIMPLE-STRING so we
85 ;;; don't use that because on SBCL BASE-CHARs can only hold ASCII.
86 ;;; Also, with (> SPEED SAFETY) (setf (schar base-str n) big-char)
87 ;;; will quietly work, sort of.
88 ;;;
89 ;;; XXX: test this on various lisps.
90
91 (defconstant unicode-char-code-limit
92 char-code-limit
93 "An alias for CL:CHAR-CODE-LIMIT which might be lower than
94 #x110000 on some Lisps.")
95
96 (deftype unicode-char ()
97 "This character type can hold any characters whose CHAR-CODEs
98 are less than UNICODE-CHAR-CODE-LIMIT."
99 #+lispworks 'lw:simple-char
100 #-lispworks 'character)
101
102 (deftype simple-unicode-string ()
103 "Alias for (SIMPLE-ARRAY UNICODE-CHAR (*))."
104 '(simple-array unicode-char (*)))
105
106 (deftype unicode-string ()
107 "Alias for (VECTOR UNICODE-CHAR *)."
108 '(vector unicode-char *))
109
110 (defparameter *string-vector-mappings*
111 (instantiate-concrete-mappings
112 ;; :optimize ((speed 3) (safety 0) (debug 0) (compilation-speed 0))
113 :octet-seq-setter ub-set
114 :octet-seq-getter ub-get
115 :octet-seq-type (simple-array (unsigned-byte 8) (*))
116 :code-point-seq-setter string-set
117 :code-point-seq-getter string-get
118 :code-point-seq-type simple-unicode-string))
119
120 #+sbcl
121 (defparameter *simple-base-string-vector-mappings*
122 (instantiate-concrete-mappings
123 ;; :optimize ((speed 3) (safety 0) (debug 0) (compilation-speed 0))
124 :instantiate-decoders nil
125 :octet-seq-setter ub-set
126 :octet-seq-getter ub-get
127 :octet-seq-type (simple-array (unsigned-byte 8) (*))
128 :code-point-seq-setter string-set
129 :code-point-seq-getter string-get
130 :code-point-seq-type simple-base-string))
131
132 ;;; Do we want a more a specific error condition here?
133 (defun check-vector-bounds (vector start end)
134 (unless (<= 0 start end (length vector))
135 (error "Invalid start (~A) and end (~A) values for vector of length ~A."
136 start end (length vector))))
137
138 (defmacro with-simple-vector (((v vector) (s start) (e end)) &body body)
139 "If VECTOR is a displaced or adjustable array, binds V to the
140 underlying simple vector, adds an adequate offset to START and
141 END and binds those offset values to S and E. Otherwise, if
142 VECTOR is already a simple array, it's simply bound to V with no
143 further changes.
144
145 START and END are unchecked and assumed to be within bounds.
146
147 Note that in some Lisps, a slow copying implementation is
148 necessary to obtain a simple vector thus V will be bound to a
149 copy of VECTOR coerced to a simple-vector. Therefore, you
150 shouldn't attempt to modify V."
151 #+sbcl
152 `(sb-kernel:with-array-data ((,v ,vector) (,s ,start) (,e ,end))
153 ,@body)
154 #+(or cmu scl)
155 `(lisp::with-array-data ((,v ,vector) (,s ,start) (,e ,end))
156 ,@body)
157 #+openmcl
158 (with-unique-names (offset)
159 `(multiple-value-bind (,v ,offset)
160 (ccl::array-data-and-offset ,vector)
161 (let ((,s (+ ,start ,offset))
162 (,e (+ ,end ,offset)))
163 ,@body)))
164 #+allegro
165 (with-unique-names (offset)
166 `(excl::with-underlying-simple-vector (,vector ,v ,offset)
167 (let ((,e (+ ,end ,offset))
168 (,s (+ ,start ,offset)))
169 ,@body)))
170 ;; slow, copying implementation
171 #-(or sbcl cmu scl openmcl allegro)
172 (once-only (vector)
173 `(funcall (if (adjustable-array-p ,vector)
174 #'call-with-array-data/copy
175 #'call-with-array-data/fast)
176 ,vector ,start ,end
177 (lambda (,v ,s ,e) ,@body))))
178
179 #-(or sbcl cmu scl openmcl allegro)
180 (progn
181 ;; Stolen from f2cl.
182 (defun array-data-and-offset (array)
183 (loop with offset = 0 do
184 (multiple-value-bind (displaced-to index-offset)
185 (array-displacement array)
186 (when (null displaced-to)
187 (return-from array-data-and-offset
188 (values array offset)))
189 (incf offset index-offset)
190 (setf array displaced-to))))
191
192 (defun call-with-array-data/fast (vector start end fn)
193 (multiple-value-bind (data offset)
194 (array-data-and-offset vector)
195 (funcall fn data (+ offset start) (+ offset end))))
196
197 (defun call-with-array-data/copy (vector start end fn)
198 (funcall fn (replace (make-array (- end start) :element-type
199 (array-element-type vector))
200 vector :start2 start :end2 end)
201 0 (- end start))))
202
203 (defmacro with-checked-simple-vector (((v vector) (s start) (e end)) &body body)
204 "Like WITH-SIMPLE-VECTOR but bound-checks START and END."
205 (once-only (vector start)
206 `(let ((,e (or ,end (length ,vector))))
207 (check-vector-bounds ,vector ,start ,e)
208 (with-simple-vector ((,v ,vector) (,s ,start) (,e ,e))
209 ,@body))))
210
211 ;;; Future features these functions should have:
212 ;;;
213 ;;; * null-terminate
214 ;;; * specify target vector/string + offset
215 ;;; * documentation :)
216
217 (declaim (inline octets-to-string string-to-octets string-size-in-octets
218 vector-size-in-chars concatenate-strings-to-octets
219 bom-vector))
220
221 (defun octets-to-string (vector &key (start 0) end
222 (errorp (not *suppress-character-coding-errors*))
223 (encoding *default-character-encoding*))
224 (check-type vector (vector (unsigned-byte 8)))
225 (with-checked-simple-vector ((vector vector) (start start) (end end))
226 (declare (type (simple-array (unsigned-byte 8) (*)) vector))
227 (let ((*suppress-character-coding-errors* (not errorp))
228 (mapping (lookup-mapping *string-vector-mappings* encoding)))
229 (multiple-value-bind (size new-end)
230 (funcall (code-point-counter mapping) vector start end -1)
231 ;; TODO we could optimize ASCII here: the result should
232 ;; be a simple-base-string filled using code-char...
233 (let ((string (make-string size :element-type 'unicode-char)))
234 (funcall (decoder mapping) vector start new-end string 0)
235 string)))))
236
237 (defun bom-vector (encoding use-bom)
238 (check-type use-bom (member :default t nil))
239 (the simple-vector
240 (if (null use-bom)
241 #()
242 (let ((enc (typecase encoding
243 (external-format (external-format-encoding encoding))
244 (t (get-character-encoding encoding)))))
245 (if (or (eq use-bom t)
246 (and (eq use-bom :default) (enc-use-bom enc)))
247 ;; VALUES avoids a "type assertion too complex to check" note.
248 (values (enc-bom-encoding enc))
249 #())))))
250
251 (defun string-to-octets (string &key (encoding *default-character-encoding*)
252 (start 0) end (use-bom :default)
253 (errorp (not *suppress-character-coding-errors*)))
254 (declare (optimize (speed 3) (safety 2)))
255 (let ((*suppress-character-coding-errors* (not errorp)))
256 (etypecase string
257 ;; On some lisps (e.g. clisp and ccl) all strings are BASE-STRING and all
258 ;; characters are BASE-CHAR. So, only enable this optimization for
259 ;; selected targets.
260 #+sbcl
261 (simple-base-string
262 (unless end
263 (setf end (length string)))
264 (check-vector-bounds string start end)
265 (let* ((mapping (lookup-mapping *simple-base-string-vector-mappings*
266 encoding))
267 (bom (bom-vector encoding use-bom))
268 (bom-length (length bom))
269 ;; OPTIMIZE: we could use the (length string) information here
270 ;; because it's a simple-base-string where each character <= 127
271 (result (make-array
272 (+ (the array-index
273 (funcall (the function (octet-counter mapping))
274 string start end -1))
275 bom-length)
276 :element-type '(unsigned-byte 8))))
277 (replace result bom)
278 (funcall (the function (encoder mapping))
279 string start end result bom-length)
280 result))
281 (string
282 ;; FIXME: we shouldn't really need that coercion to UNICODE-STRING
283 ;; but we kind of because it's declared all over. To avoid that,
284 ;; we'd need different types for input and output strings. Or maybe
285 ;; this is not a problem; figure that out.
286 (with-checked-simple-vector ((string (coerce string 'unicode-string))
287 (start start) (end end))
288 (declare (type simple-unicode-string string))
289 (let* ((mapping (lookup-mapping *string-vector-mappings* encoding))
290 (bom (bom-vector encoding use-bom))
291 (bom-length (length bom))
292 (result (make-array
293 (+ (the array-index
294 (funcall (the function (octet-counter mapping))
295 string start end -1))
296 bom-length)
297 :element-type '(unsigned-byte 8))))
298 (replace result bom)
299 (funcall (the function (encoder mapping))
300 string start end result bom-length)
301 result))))))
302
303 (defun concatenate-strings-to-octets (encoding &rest strings)
304 "Optimized equivalent of
305 \(string-to-octets \(apply #'concatenate 'string strings)
306 :encoding encoding)"
307 (declare (dynamic-extent strings))
308 (let* ((mapping (lookup-mapping *string-vector-mappings* encoding))
309 (octet-counter (octet-counter mapping))
310 (vector (make-array
311 (the array-index
312 (reduce #'+ strings
313 :key (lambda (string)
314 (funcall octet-counter
315 string 0 (length string) -1))))
316 :element-type '(unsigned-byte 8)))
317 (current-index 0))
318 (declare (type array-index current-index))
319 (dolist (string strings)
320 (check-type string string)
321 (with-checked-simple-vector ((string (coerce string 'unicode-string))
322 (start 0) (end (length string)))
323 (declare (type simple-unicode-string string))
324 (incf current-index
325 (funcall (encoder mapping)
326 string start end vector current-index))))
327 vector))
328
329 (defun string-size-in-octets (string &key (start 0) end (max -1 maxp)
330 (errorp (not *suppress-character-coding-errors*))
331 (encoding *default-character-encoding*))
332 (check-type string string)
333 (with-checked-simple-vector ((string (coerce string 'unicode-string))
334 (start start) (end end))
335 (declare (type simple-unicode-string string))
336 (let ((mapping (lookup-mapping *string-vector-mappings* encoding))
337 (*suppress-character-coding-errors* (not errorp)))
338 (when maxp (assert (plusp max)))
339 (funcall (octet-counter mapping) string start end max))))
340
341 (defun vector-size-in-chars (vector &key (start 0) end (max -1 maxp)
342 (errorp (not *suppress-character-coding-errors*))
343 (encoding *default-character-encoding*))
344 (check-type vector (vector (unsigned-byte 8)))
345 (with-checked-simple-vector ((vector vector) (start start) (end end))
346 (declare (type (simple-array (unsigned-byte 8) (*)) vector))
347 (let ((mapping (lookup-mapping *string-vector-mappings* encoding))
348 (*suppress-character-coding-errors* (not errorp)))
349 (when maxp (assert (plusp max)))
350 (funcall (code-point-counter mapping) vector start end max))))
351
352 (declaim (notinline octets-to-string string-to-octets string-size-in-octets
353 vector-size-in-chars concatenate-strings-to-octets))