crow-httpd: switch from CVS RCSID to GIT revision for versioning
[mmondor.git] / mmsoftware / cl / server / character.lisp
CommitLineData
de8171b2 1;;; $Id: character.lisp,v 1.6 2013/07/02 19:42:06 mmondor Exp $
bf849755
MM
2
3#|
4
5Copyright (c) 2011, Matthew Mondor
6All rights reserved.
7
8Redistribution and use in source and binary forms, with or without
9modification, are permitted provided that the following conditions
10are met:
111. Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
132. Redistributions in binary form must reproduce the above copyright
14 notice, this list of conditions and the following disclaimer in the
15 documentation and/or other materials provided with the distribution.
16
17THIS SOFTWARE IS PROVIDED BY MATTHEW MONDOR ``AS IS'' AND ANY EXPRESS OR
18IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20IN NO EVENT SHALL MATTHEW MONDOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
22BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
23USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28|#
29
9985a105 30;;; character.lisp - Character related utilities
bf849755
MM
31
32
33(declaim (optimize (speed 3) (safety 1) (debug 3)))
34
35(defpackage :character
36 (:use :cl)
37 (:export #:utf-8-string-encode
38 #:utf-8-string-decode
9985a105
MM
39 #:string-normalize
40 #:character-intervals
41 #:make-valid-character-table
42 #:character-valid-p
914e314a
MM
43 #:string-valid-p
44 #:member-character-intervals-p))
bf849755
MM
45
46(in-package :character)
47
bf849755
MM
48
49
9985a105
MM
50;;; UTF-8
51
bf849755
MM
52(defun utf-8-string-encode (string)
53 "Encodes the supplied STRING to an UTF-8 octets vector which it returns."
54 (let ((v (make-array (+ 5 (length string)) ; Best case but we might grow
55 :element-type '(unsigned-byte 8)
56 :adjustable t
57 :fill-pointer 0)))
58 (with-open-stream (s (ext:make-sequence-output-stream
59 v :external-format :utf-8))
60 (loop
61 for c across string
62 do
63 (write-char c s)
64 (let ((d (array-dimension v 0)))
65 (when (< (- d (fill-pointer v)) 5)
66 (adjust-array v (* 2 d))))))
67 v))
68
69(defun utf-8-string-decode (bytes)
70 "Decodes the UTF-8 octets vector BYTES to string which it returns.
71Invalid sequence octets are imported as LATIN-1 characters."
72 (macrolet ((add-char (c)
73 `(vector-push-extend ,c string 1024)))
74 (with-open-stream (s (ext:make-sequence-input-stream
75 bytes :external-format :utf-8))
76 (loop
77 with string = (make-array 1024
78 :element-type 'character
79 :adjustable t
80 :fill-pointer 0)
81 for c of-type character =
82 (handler-bind
83 ((ext:stream-decoding-error
84 #'(lambda (e)
85 (mapc #'(lambda (o)
86 ;; Assume LATIN-1 and import
87 (add-char (code-char o)))
88 (ext:character-decoding-error-octets e))
89 (invoke-restart 'continue)))
90 (end-of-file
91 #'(lambda (e)
92 (declare (ignore e))
93 (loop-finish))))
94 (read-char s))
95 do (add-char c)
96 finally (return string)))))
97
98
de8171b2
MM
99;;; Mapping to help comparisons.
100;;; XXX There is no actual combining characters normalization going on here,
101;;; although it could be useful, and would need to be done before this step,
102;;; resolving to special characters mapped by this table.
9985a105 103
bf849755
MM
104(defparameter *normalize*
105 '((#\a (#\A
de8171b2
MM
106 #\À #\Á #\Â #\Ã #\Ä #\Å #\Ā #\Ă #\Ắ #\Ằ #\Ẳ #\Ẵ #\Ặ #\Ǎ
107 #\à #\á #\â #\ã #\ä #\å #\ā #\ă #\ắ #\ằ #\ẳ #\ẵ #\ặ #\ǎ))
bf849755
MM
108 (#\b (#\B
109 #\Ḃ
110 #\ḃ))
111 (#\c (#\C
112 #\Ç #\Ċ
113 #\ç #\ċ))
114 (#\d (#\D
115 #\Ð #\Ḋ
116 #\ð #\ḋ))
117 (#\e (#\E
de8171b2
MM
118 #\È #\É #\Ê #\Ë #\Ē #\Ĕ #\Ḝ #\Ě
119 #\è #\é #\ê #\ë #\ē #\ĕ #\ḝ #\ě))
bf849755
MM
120 (#\f (#\F
121 #\Ḟ
122 #\ḟ))
123 (#\g (#\G
de8171b2
MM
124 #\Ġ #\Ğ
125 #\ġ #\ğ))
126 (#\h (#\H
127 #\Ḫ
128 #\ḫ))
bf849755 129 (#\i (#\I
de8171b2
MM
130 #\Ì #\Í #\Î #\Ï #\Ī #\Ĭ #\Ǐ
131 #\ì #\í #\î #\ï #\ī #\ĭ #\ǐ))
bf849755
MM
132 (#\j (#\J))
133 (#\k (#\K))
134 (#\l (#\L))
135 (#\m (#\M
136 #\Ṁ
137 #\ṁ
138 #\µ))
139 (#\n (#\N
140 #\Ñ
141 #\ñ))
142 (#\o (#\O
de8171b2
MM
143 #\Ò #\Ó #\Ô #\Õ #\Ö #\Ø #\Ő #\Ō #\Ŏ #\Ǒ
144 #\ò #\ó #\ô #\õ #\ö #\ø #\ő #\ō #\ŏ #\ǒ))
bf849755
MM
145 (#\p (#\P))
146 (#\q (#\Q))
147 (#\r (#\R))
148 (#\s (#\S
149 #\Š #\Ṡ
150 #\š #\ṡ))
151 (#\t (#\T
152 #\Ṫ
153 #\ṫ))
154 (#\u (#\U
de8171b2
MM
155 #\Ù #\Ú #\Û #\Ü #\Ű #\Ū #\Ŭ #\Ǔ
156 #\ù #\ú #\û #\ü #\ű #\ū #\ŭ #\ǔ))
157 (#\v (#\V #\Ѵ #\Ѷ))
bf849755
MM
158 (#\w (#\W
159 #\Ŵ
160 #\ŵ))
161 (#\x (#\X))
162 (#\y (#\Y
de8171b2
MM
163 #\Ý #\Ÿ #\Ŷ #\Ў
164 #\ý #\ÿ #\ŷ #\ў))
bf849755
MM
165 (#\z (#\Z
166 #\Ž
167 #\ž))
de8171b2
MM
168 (#\" (#\« #\»))
169 (#\- (#\U2010 #\U2212))
170 (#\' (#\U2018 #\U02BC #\U02C8 #\᾿))
171 (#\` (#\U02CB #\U2019 #\῾))
172 (#\| (#\U2223))
173 (#\/ (#\U2044 #\U2215))
174 (#\\ (#\U2216))
175 (#\: (#\U2236))
176 (#\< (#\U2039 #\U2329 #\U3008))
177 (#\> (#\U203A #\U232A #\U3009))
178 (#\^ (#\U02C4 #\U02C6 #\U2038 #\U2303))
179 (#\_ (#\U02CD))
180 (#\~ (#\U223C))
d155320d 181 ("..." (#\…))
bf849755
MM
182 ("1/4" (#\¼))
183 ("1/2" (#\½))
184 ("3/4" (#\¾))
185 ("ae" (#\Æ #\æ))
186 ("db" (#\ȸ))
187 ("dz" (#\ʣ #\ʥ #\ʤ))
188 ("ff" (#\ff))
189 ("ffi" (#\ffi))
190 ("ffl" (#\ffl))
191 ("fi" (#\fi))
192 ("fl" (#\fl))
de8171b2 193 ("fn" (#\ʩ)) ; "fŋ"
bf849755
MM
194 ("ft" (#\ſt))
195 ("ij" (#\IJ #\ij))
196 ("ls" (#\ʪ))
197 ("lz" (#\ʫ))
198 ("oe" (#\Œ #\œ))
199 ("qp" (#\ȹ))
200 ("ss" (#\ß))
201 ("st" (#\st))
202 ("tc" (#\ʨ))
203 ("th" (#\Þ #\þ))
204 ("ts" (#\ʦ))
de8171b2 205 ("tf" (#\ʧ)) ; "tʃ"
bf849755 206 ("ue" (#\ᵫ))
de8171b2 207 ("(a)" (#\Ⓐ))
bf849755
MM
208 ("(c)" (#\©))
209 ("(r)" (#\®))
210 ("tm" (#\™))))
211
212(defun fill-tables ()
213 (let ((v (make-array 256))
214 (ta (make-hash-table :test #'eq)))
215 (loop
216 for i from 0 to 255
217 do (setf (aref v i) (code-char i)))
218 (mapc #'(lambda (l)
219 (destructuring-bind (to from) l
220 (mapc #'(lambda (c)
221 (let ((i (char-code c)))
222 (if (< i 256)
223 (setf (aref v i) to)
224 (setf (gethash c ta) to))))
225 from)))
226 *normalize*)
227 (values v ta)))
228
229(multiple-value-bind (normalize-vector normalize-table)
230 (fill-tables)
231 (defun char-normalize (c)
232 (let ((i (char-code c)))
233 (if (< i 256)
234 ;; Optimize common LATIN-1 case
235 (aref normalize-vector i)
236 (multiple-value-bind (to exists-p)
237 (gethash c normalize-table)
238 (if exists-p
239 to
240 c))))))
241
242(defun string-normalize (string)
243 "Returns a copy of STRING normalized for easier matching.
244The returned string is lowercase, with the various LATIN special characters
245resolved to simpler ASCII ones. Various ligatures are also converted."
246 (with-output-to-string (out)
247 (with-input-from-string (in string)
248 (loop
249 for c = (handler-case
250 (read-char in)
251 (end-of-file ()
252 nil))
253 for toc = (if c (char-normalize c) nil)
254 while toc
255 when (characterp toc) do (write-char toc out)
256 else do (write-string toc out)))))
9985a105
MM
257
258
259;;; Character classes/ranges
260
261(defun character-intervals (&rest list)
262 "Generates a string containing all characters within the supplied ranges in
263LIST. Every supplied element can either be a single character or a set of
264two characters, the first of which must be smaller than the second."
265 (loop
266 with out = (make-array 16
267 :element-type 'character
268 :adjustable t
269 :fill-pointer 0)
270 for l in list
271 when (listp l)
272 do (destructuring-bind (from-char to-char) l
273 (loop
274 with from-code = (char-code from-char)
275 with to-code = (char-code to-char)
276 for i from from-code to to-code
277 do (vector-push-extend (code-char i) out 16)))
278 else
279 do (vector-push-extend l out 16)
280 finally (return out)))
281
282(defun make-valid-character-table (string)
283 "Creates a character matching table for use with CHARACTER-VALID-P and
284STRING-VALID-P. All characters specified in STRING will be considered valid.
285Note that all supplied characters should be < 256 (ASCII or LATIN-1)."
286 (let ((v (make-array 256
287 :element-type 'boolean
288 :initial-element nil)))
289 (loop
290 for c of-type character across string
291 for i of-type fixnum = (char-code c)
292 do (setf (svref v i) t))
293 v))
294
1e339593 295(declaim (inline character-valid-p))
9985a105
MM
296(defun character-valid-p (table char)
297 "Given the character matching table TABLE and character CHAR, returns T if
298CHAR is valid or NIL."
299 (declare (optimize (speed 3) (safety 0)))
300 (let ((i (char-code char)))
301 (declare (type fixnum i))
302 (if (< i 256)
303 (svref table i)
304 nil)))
305
306(defun string-valid-p (table string)
307 "Given the character matching table TABLE and string STRING, returns T if
308all characters of STRING are valid, or NIL."
309 (declare (optimize (speed 3) (safety 0)))
310 (loop
311 for c of-type character across string
312 unless (let ((i (char-code c)))
313 (declare (type fixnum i))
314 (if (< i 256)
315 (svref table i)
316 nil))
317 do (return nil)
318 finally (return t)))
914e314a
MM
319
320(defmacro member-character-intervals-p (char &rest list)
321 "Utility macro to generate a test for character CHAR matching the
322character interval(s) specified by LIST, using the same syntax which
323CHARACTER-INTERVALS expects. This may be more optimized than calling
324a function or referencing to dynamic symbols bound to tables.
325CHAR is compared to the specified ranges from largest to shortest range,
326then to individual characters, if any."
327 (flet ((range-weight (range)
328 (if (listp range)
329 (destructuring-bind (from-char to-char) range
330 (- (char-code to-char) (char-code from-char)))
331 0)))
332 (let ((intervals (sort list #'> :key #'range-weight))
333 (s-code (gensym)))
334 `(let ((,s-code (char-code ,char)))
335 (declare (optimize (speed 3) (safety 0) (debug 0))
336 (type fixnum ,s-code))
337 (or ,@(mapcar #'(lambda (range)
338 (if (listp range)
339 (destructuring-bind (from-char to-char) range
340 `(<= ,(char-code from-char)
341 ,s-code
342 ,(char-code to-char)))
343 `(= ,(char-code range) ,s-code)))
344 intervals))))))