netbsd/pr.txt: update
[mmondor.git] / mmsoftware / mmlib / utf8.c
1 /* $Id: utf8.c,v 1.2 2010/03/28 12:14:36 mmondor Exp $ */
2
3 /*
4 * Copyright (c) 2010, Matthew Mondor.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY MATTHEW MONDOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL MATTHEW MONDOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
21 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
22 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #include <utf8.h>
29
30
31 #define GET_BYTE(v) \
32 if (b_p == end_bytes) { \
33 status = UTF8_BYTES_ARRAY_EMPTY; \
34 break; \
35 } \
36 v = *b_p++;
37 } while (/* CONSTCOND */0)
38
39 #define VALID_CONT_BYTE(v) \
40 if ((v) < 0x7e || (v) > 0xbf) { \
41 status = UTF8_INVALID_CONT_BYTE; \
42 break; \
43 }
44
45 /*
46 * Decodes UTF-8 sequences found in the array bound by <bytes> and <end_bytes>
47 * to UTF-32 characters added to the array bound by <chars> and <end_chars>.
48 * Returns with <bytes> set to after the last successfully decoded sequence,
49 * and with <chars> set to after the last character added.
50 * Returns an UTF8_<status>.
51 */
52 int
53 utf8_decode(int32_t **chars, int32_t *end_chars,
54 const uint8_t **bytes, const uint8_t *end_bytes)
55 {
56 int status = UTF8_UNKNOWN;
57 int32_t *c_p = *chars;
58 const uint8_t *b_p = *bytes;
59 uint8_t b1 = 0, b2 = 0, b3 = 0, b4 = 0;
60
61 for (; c_p < end_chars; *bytes = b_p) {
62
63 GET_BYTE(b1);
64 if (b1 < 0x80) {
65 *c_p++ = b1;
66 continue;
67 }
68
69 if (b1 < 0xc0) {
70 status = UTF8_INVALID_START_BYTE;
71 break;
72 }
73
74 GET_BYTE(b2);
75 VALID_CONT_BYTE(b2);
76 if (b1 < 0xc2) {
77 status = UTF8_OVERLONG_SEQUENCE;
78 break;
79 }
80 if (b1 < 0xe0) {
81 *c_p++ = ((0x1f & b1) << 6) | (b2 & ~0x80);
82 continue;
83 }
84
85 GET_BYTE(b3);
86 VALID_CONT_BYTE(b3);
87 if (b1 == 0xe0 && b2 < 0xa0) {
88 status = UTF8_OVERLONG_SEQUENCE;
89 break;
90 }
91 if (b1 < 0xf0) {
92 *c_p++ = ((b1 & 0x0f) << 12) |
93 (((b2 & 0x3f) << 6) |
94 (b3 & 0x3f));
95 continue;
96 }
97
98 GET_BYTE(b4);
99 VALID_CONT_BYTE(b4);
100 if (b1 == 0xf0 && b2 < 0x90) {
101 status = UTF8_OVERLONG_SEQUENCE;
102 break;
103 }
104 if (b1 < 0xf8) {
105 if (b1 > 0xf4 || (b1 == 0xf4 && b2 > 0x8f)) {
106 status = UTF8_CHAR_OUT_OF_RANGE;
107 break;
108 }
109 *c_p++ = (((b1 & 0x07) << 18) |
110 ((b2 & ~0x80) << 12) |
111 ((b3 & ~0x80) << 6) |
112 (b4 & ~0x80));
113 continue;
114 }
115
116 status = UTF8_OVERLONG_SEQUENCE;
117 break;
118 }
119
120 if (c_p == end_chars)
121 status = UTF8_CHARS_ARRAY_FULL;
122
123 *chars = c_p;
124
125 return status;
126 }
127
128 #undef GET_BYTE
129 #undef VALID_CONT_BYTE
130
131
132 /*
133 * Encodes the UTF-32 characters found in the array enclosed by <chars> and
134 * <end_chars> to UTF-8 bytes into the array enclosed by <bytes> and
135 * <end_bytes>. Returns an UTF8_* status code.
136 * <bytes> will be set after the last byte set, and <chars> after the last
137 * character encoded.
138 */
139 int
140 utf8_encode(uint8_t **bytes, uint8_t *end_bytes,
141 const int32_t **chars, const int32_t *end_chars)
142 {
143 int status = UTF8_UNKNOWN, n;
144 uint8_t *b_p = *bytes;
145 const int32_t *c_p = *chars;
146 int32_t c;
147
148 for (; c_p < end_chars; c_p++) {
149 c = *c_p;
150
151 if (c < 0x80)
152 n = 1;
153 else if (c < 0x0800)
154 n = 2;
155 else if (c < 0x00010000)
156 n = 3;
157 else
158 n = 4;
159
160 if (end_bytes - b_p < n) {
161 status = UTF8_BYTES_ARRAY_FULL;
162 break;
163 }
164
165 switch (n) {
166 case 1:
167 *b_p = (uint8_t)c;
168 break;
169 case 2:
170 b_p[0] = 0xc0 | (c >> 6);
171 b_p[1] = 0x80 | (c & 0x3f);
172 break;
173 case 3:
174 b_p[0] = 0xe0 | (c >> 12);
175 b_p[1] = 0x80 | (0x3f & (c >> 6));
176 b_p[2] = 0x80 | (c & 0x3f);
177 break;
178 case 4:
179 b_p[0] = 0xf0 | (0x07 & (c >> 18));
180 b_p[1] = 0x80 | (0x3f & (c >> 12));
181 b_p[2] = 0x80 | (0x3f & (c >> 6));
182 b_p[3] = 0x80 | (c & 0x3f);
183 break;
184 }
185
186 b_p = &b_p[n];
187 }
188
189 if (c_p == end_chars)
190 status = UTF8_CHARS_ARRAY_EMPTY;
191
192 *bytes = b_p;
193 *chars = c_p;
194
195 return status;
196 }