Initial import
[mmondor.git] / mmsoftware / cl / server / character.lisp
1 ;;; $Id: character.lisp,v 1.1 2011/08/30 08:01:36 mmondor Exp $
2
3 #|
4
5 Copyright (c) 2011, Matthew Mondor
6 All rights reserved.
7
8 Redistribution and use in source and binary forms, with or without
9 modification, are permitted provided that the following conditions
10 are met:
11 1. Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13 2. Redistributions in binary form must reproduce the above copyright
14 notice, this list of conditions and the following disclaimer in the
15 documentation and/or other materials provided with the distribution.
16
17 THIS SOFTWARE IS PROVIDED BY MATTHEW MONDOR ``AS IS'' AND ANY EXPRESS OR
18 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 IN NO EVENT SHALL MATTHEW MONDOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
22 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
23 USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28 |#
29
30 ;;; character.lisp - Character conversion utilities
31
32
33 (declaim (optimize (speed 3) (safety 1) (debug 3)))
34
35 (defpackage :character
36 (:use :cl)
37 (:export #:utf-8-string-encode
38 #:utf-8-string-decode
39 #:string-normalize))
40
41 (in-package :character)
42
43 (defparameter *rcsid*
44 "$Id: character.lisp,v 1.1 2011/08/30 08:01:36 mmondor Exp $")
45
46
47 (defun utf-8-string-encode (string)
48 "Encodes the supplied STRING to an UTF-8 octets vector which it returns."
49 (let ((v (make-array (+ 5 (length string)) ; Best case but we might grow
50 :element-type '(unsigned-byte 8)
51 :adjustable t
52 :fill-pointer 0)))
53 (with-open-stream (s (ext:make-sequence-output-stream
54 v :external-format :utf-8))
55 (loop
56 for c across string
57 do
58 (write-char c s)
59 (let ((d (array-dimension v 0)))
60 (when (< (- d (fill-pointer v)) 5)
61 (adjust-array v (* 2 d))))))
62 v))
63
64 (defun utf-8-string-decode (bytes)
65 "Decodes the UTF-8 octets vector BYTES to string which it returns.
66 Invalid sequence octets are imported as LATIN-1 characters."
67 (macrolet ((add-char (c)
68 `(vector-push-extend ,c string 1024)))
69 (with-open-stream (s (ext:make-sequence-input-stream
70 bytes :external-format :utf-8))
71 (loop
72 with string = (make-array 1024
73 :element-type 'character
74 :adjustable t
75 :fill-pointer 0)
76 for c of-type character =
77 (handler-bind
78 ((ext:stream-decoding-error
79 #'(lambda (e)
80 (mapc #'(lambda (o)
81 ;; Assume LATIN-1 and import
82 (add-char (code-char o)))
83 (ext:character-decoding-error-octets e))
84 (invoke-restart 'continue)))
85 (end-of-file
86 #'(lambda (e)
87 (declare (ignore e))
88 (loop-finish))))
89 (read-char s))
90 do (add-char c)
91 finally (return string)))))
92
93
94 (defparameter *normalize*
95 '((#\a (#\A
96 #######
97 #######))
98 (#\b (#\B
99 #\Ḃ
100 #\ḃ))
101 (#\c (#\C
102 ##
103 ##))
104 (#\d (#\D
105 ##\Ḋ
106 ##\ḋ))
107 (#\e (#\E
108 #####
109 #####))
110 (#\f (#\F
111 #\Ḟ
112 #\ḟ))
113 (#\g (#\G
114 #
115 #))
116 (#\h (#\H))
117 (#\i (#\I
118 #####
119 #####))
120 (#\j (#\J))
121 (#\k (#\K))
122 (#\l (#\L))
123 (#\m (#\M
124 #\Ṁ
125 #\ṁ
126 #))
127 (#\n (#\N
128 #
129 #))
130 (#\o (#\O
131 ########
132 ########))
133 (#\p (#\P))
134 (#\q (#\Q))
135 (#\r (#\R))
136 (#\s (#\S
137 ##\Ṡ
138 ##\ṡ))
139 (#\t (#\T
140 #\Ṫ
141 #\ṫ))
142 (#\u (#\U
143 ######
144 ######))
145 (#\v (#\V))
146 (#\w (#\W
147 #
148 #))
149 (#\x (#\X))
150 (#\y (#\Y
151 ###
152 ##\ÿ #))
153 (#\z (#\Z
154 #
155 #))
156 ("1/4" (#))
157 ("1/2" (#))
158 ("3/4" (#))
159 ("ae" (##))
160 ("db" (#))
161 ("dz" (###))
162 ("ff" (#\ff))
163 ("ffi" (#\ffi))
164 ("ffl" (#\ffl))
165 ("fi" (#\fi))
166 ("fl" (#\fl))
167 ; ("fŋ" (#\ʩ))
168 ("ft" (#\ſt))
169 ("ij" (##))
170 ("ls" (#))
171 ("lz" (#))
172 ("oe" (##))
173 ("qp" (#))
174 ("ss" (#))
175 ("st" (#\st))
176 ("tc" (#))
177 ("th" (##))
178 ("ts" (#))
179 ; ("tʃ" (#\ʧ))
180 ("ue" (#\ᵫ))
181 ("(c)" (#))
182 ("(r)" (#))
183 ("tm" (#\™))))
184
185 (defun fill-tables ()
186 (let ((v (make-array 256))
187 (ta (make-hash-table :test #'eq)))
188 (loop
189 for i from 0 to 255
190 do (setf (aref v i) (code-char i)))
191 (mapc #'(lambda (l)
192 (destructuring-bind (to from) l
193 (mapc #'(lambda (c)
194 (let ((i (char-code c)))
195 (if (< i 256)
196 (setf (aref v i) to)
197 (setf (gethash c ta) to))))
198 from)))
199 *normalize*)
200 (values v ta)))
201
202 (multiple-value-bind (normalize-vector normalize-table)
203 (fill-tables)
204 (defun char-normalize (c)
205 (let ((i (char-code c)))
206 (if (< i 256)
207 ;; Optimize common LATIN-1 case
208 (aref normalize-vector i)
209 (multiple-value-bind (to exists-p)
210 (gethash c normalize-table)
211 (if exists-p
212 to
213 c))))))
214
215 (defun string-normalize (string)
216 "Returns a copy of STRING normalized for easier matching.
217 The returned string is lowercase, with the various LATIN special characters
218 resolved to simpler ASCII ones. Various ligatures are also converted."
219 (with-output-to-string (out)
220 (with-input-from-string (in string)
221 (loop
222 for c = (handler-case
223 (read-char in)
224 (end-of-file ()
225 nil))
226 for toc = (if c (char-normalize c) nil)
227 while toc
228 when (characterp toc) do (write-char toc out)
229 else do (write-string toc out)))))