Support dynamic-only virtual hosts by supplying NIL for a VHOST's ROOT.
[mmondor.git] / mmsoftware / cl / server / character.lisp
1 ;;; $Id: character.lisp,v 1.5 2012/09/15 01:52:24 mmondor Exp $
2
3 #|
4
5 Copyright (c) 2011, Matthew Mondor
6 All rights reserved.
7
8 Redistribution and use in source and binary forms, with or without
9 modification, are permitted provided that the following conditions
10 are met:
11 1. Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13 2. Redistributions in binary form must reproduce the above copyright
14 notice, this list of conditions and the following disclaimer in the
15 documentation and/or other materials provided with the distribution.
16
17 THIS SOFTWARE IS PROVIDED BY MATTHEW MONDOR ``AS IS'' AND ANY EXPRESS OR
18 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 IN NO EVENT SHALL MATTHEW MONDOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
22 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
23 USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28 |#
29
30 ;;; character.lisp - Character related utilities
31
32
33 (declaim (optimize (speed 3) (safety 1) (debug 3)))
34
35 (defpackage :character
36 (:use :cl)
37 (:export #:utf-8-string-encode
38 #:utf-8-string-decode
39 #:string-normalize
40 #:character-intervals
41 #:make-valid-character-table
42 #:character-valid-p
43 #:string-valid-p
44 #:member-character-intervals-p))
45
46 (in-package :character)
47
48 (defparameter *rcsid*
49 "$Id: character.lisp,v 1.5 2012/09/15 01:52:24 mmondor Exp $")
50
51
52 ;;; UTF-8
53
54 (defun utf-8-string-encode (string)
55 "Encodes the supplied STRING to an UTF-8 octets vector which it returns."
56 (let ((v (make-array (+ 5 (length string)) ; Best case but we might grow
57 :element-type '(unsigned-byte 8)
58 :adjustable t
59 :fill-pointer 0)))
60 (with-open-stream (s (ext:make-sequence-output-stream
61 v :external-format :utf-8))
62 (loop
63 for c across string
64 do
65 (write-char c s)
66 (let ((d (array-dimension v 0)))
67 (when (< (- d (fill-pointer v)) 5)
68 (adjust-array v (* 2 d))))))
69 v))
70
71 (defun utf-8-string-decode (bytes)
72 "Decodes the UTF-8 octets vector BYTES to string which it returns.
73 Invalid sequence octets are imported as LATIN-1 characters."
74 (macrolet ((add-char (c)
75 `(vector-push-extend ,c string 1024)))
76 (with-open-stream (s (ext:make-sequence-input-stream
77 bytes :external-format :utf-8))
78 (loop
79 with string = (make-array 1024
80 :element-type 'character
81 :adjustable t
82 :fill-pointer 0)
83 for c of-type character =
84 (handler-bind
85 ((ext:stream-decoding-error
86 #'(lambda (e)
87 (mapc #'(lambda (o)
88 ;; Assume LATIN-1 and import
89 (add-char (code-char o)))
90 (ext:character-decoding-error-octets e))
91 (invoke-restart 'continue)))
92 (end-of-file
93 #'(lambda (e)
94 (declare (ignore e))
95 (loop-finish))))
96 (read-char s))
97 do (add-char c)
98 finally (return string)))))
99
100
101 ;;; Normalization to help comparisons
102
103 (defparameter *normalize*
104 '((#\a (#\A
105 #######
106 #######))
107 (#\b (#\B
108 #\Ḃ
109 #\ḃ))
110 (#\c (#\C
111 ##
112 ##))
113 (#\d (#\D
114 ##\Ḋ
115 ##\ḋ))
116 (#\e (#\E
117 #####
118 #####))
119 (#\f (#\F
120 #\Ḟ
121 #\ḟ))
122 (#\g (#\G
123 #
124 #))
125 (#\h (#\H))
126 (#\i (#\I
127 #####
128 #####))
129 (#\j (#\J))
130 (#\k (#\K))
131 (#\l (#\L))
132 (#\m (#\M
133 #\Ṁ
134 #\ṁ
135 #))
136 (#\n (#\N
137 #
138 #))
139 (#\o (#\O
140 ########
141 ########))
142 (#\p (#\P))
143 (#\q (#\Q))
144 (#\r (#\R))
145 (#\s (#\S
146 ##\Ṡ
147 ##\ṡ))
148 (#\t (#\T
149 #\Ṫ
150 #\ṫ))
151 (#\u (#\U
152 ######
153 ######))
154 (#\v (#\V))
155 (#\w (#\W
156 #
157 #))
158 (#\x (#\X))
159 (#\y (#\Y
160 ###
161 ##\ÿ #))
162 (#\z (#\Z
163 #
164 #))
165 ("..." (#\…))
166 ("1/4" (#))
167 ("1/2" (#))
168 ("3/4" (#))
169 ("ae" (##))
170 ("db" (#))
171 ("dz" (###))
172 ("ff" (#\ff))
173 ("ffi" (#\ffi))
174 ("ffl" (#\ffl))
175 ("fi" (#\fi))
176 ("fl" (#\fl))
177 ; ("fŋ" (#\ʩ))
178 ("ft" (#\ſt))
179 ("ij" (##))
180 ("ls" (#))
181 ("lz" (#))
182 ("oe" (##))
183 ("qp" (#))
184 ("ss" (#))
185 ("st" (#\st))
186 ("tc" (#))
187 ("th" (##))
188 ("ts" (#))
189 ; ("tʃ" (#\ʧ))
190 ("ue" (#\ᵫ))
191 ("(c)" (#))
192 ("(r)" (#))
193 ("tm" (#\™))))
194
195 (defun fill-tables ()
196 (let ((v (make-array 256))
197 (ta (make-hash-table :test #'eq)))
198 (loop
199 for i from 0 to 255
200 do (setf (aref v i) (code-char i)))
201 (mapc #'(lambda (l)
202 (destructuring-bind (to from) l
203 (mapc #'(lambda (c)
204 (let ((i (char-code c)))
205 (if (< i 256)
206 (setf (aref v i) to)
207 (setf (gethash c ta) to))))
208 from)))
209 *normalize*)
210 (values v ta)))
211
212 (multiple-value-bind (normalize-vector normalize-table)
213 (fill-tables)
214 (defun char-normalize (c)
215 (let ((i (char-code c)))
216 (if (< i 256)
217 ;; Optimize common LATIN-1 case
218 (aref normalize-vector i)
219 (multiple-value-bind (to exists-p)
220 (gethash c normalize-table)
221 (if exists-p
222 to
223 c))))))
224
225 (defun string-normalize (string)
226 "Returns a copy of STRING normalized for easier matching.
227 The returned string is lowercase, with the various LATIN special characters
228 resolved to simpler ASCII ones. Various ligatures are also converted."
229 (with-output-to-string (out)
230 (with-input-from-string (in string)
231 (loop
232 for c = (handler-case
233 (read-char in)
234 (end-of-file ()
235 nil))
236 for toc = (if c (char-normalize c) nil)
237 while toc
238 when (characterp toc) do (write-char toc out)
239 else do (write-string toc out)))))
240
241
242 ;;; Character classes/ranges
243
244 (defun character-intervals (&rest list)
245 "Generates a string containing all characters within the supplied ranges in
246 LIST. Every supplied element can either be a single character or a set of
247 two characters, the first of which must be smaller than the second."
248 (loop
249 with out = (make-array 16
250 :element-type 'character
251 :adjustable t
252 :fill-pointer 0)
253 for l in list
254 when (listp l)
255 do (destructuring-bind (from-char to-char) l
256 (loop
257 with from-code = (char-code from-char)
258 with to-code = (char-code to-char)
259 for i from from-code to to-code
260 do (vector-push-extend (code-char i) out 16)))
261 else
262 do (vector-push-extend l out 16)
263 finally (return out)))
264
265 (defun make-valid-character-table (string)
266 "Creates a character matching table for use with CHARACTER-VALID-P and
267 STRING-VALID-P. All characters specified in STRING will be considered valid.
268 Note that all supplied characters should be < 256 (ASCII or LATIN-1)."
269 (let ((v (make-array 256
270 :element-type 'boolean
271 :initial-element nil)))
272 (loop
273 for c of-type character across string
274 for i of-type fixnum = (char-code c)
275 do (setf (svref v i) t))
276 v))
277
278 (declaim (inline character-valid-p))
279 (defun character-valid-p (table char)
280 "Given the character matching table TABLE and character CHAR, returns T if
281 CHAR is valid or NIL."
282 (declare (optimize (speed 3) (safety 0)))
283 (let ((i (char-code char)))
284 (declare (type fixnum i))
285 (if (< i 256)
286 (svref table i)
287 nil)))
288
289 (defun string-valid-p (table string)
290 "Given the character matching table TABLE and string STRING, returns T if
291 all characters of STRING are valid, or NIL."
292 (declare (optimize (speed 3) (safety 0)))
293 (loop
294 for c of-type character across string
295 unless (let ((i (char-code c)))
296 (declare (type fixnum i))
297 (if (< i 256)
298 (svref table i)
299 nil))
300 do (return nil)
301 finally (return t)))
302
303 (defmacro member-character-intervals-p (char &rest list)
304 "Utility macro to generate a test for character CHAR matching the
305 character interval(s) specified by LIST, using the same syntax which
306 CHARACTER-INTERVALS expects. This may be more optimized than calling
307 a function or referencing to dynamic symbols bound to tables.
308 CHAR is compared to the specified ranges from largest to shortest range,
309 then to individual characters, if any."
310 (flet ((range-weight (range)
311 (if (listp range)
312 (destructuring-bind (from-char to-char) range
313 (- (char-code to-char) (char-code from-char)))
314 0)))
315 (let ((intervals (sort list #'> :key #'range-weight))
316 (s-code (gensym)))
317 `(let ((,s-code (char-code ,char)))
318 (declare (optimize (speed 3) (safety 0) (debug 0))
319 (type fixnum ,s-code))
320 (or ,@(mapcar #'(lambda (range)
321 (if (listp range)
322 (destructuring-bind (from-char to-char) range
323 `(<= ,(char-code from-char)
324 ,s-code
325 ,(char-code to-char)))
326 `(= ,(char-code range) ,s-code)))
327 intervals))))))