|
|
1.1 root 1: /*
2: * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3: *
4: * @APPLE_LICENSE_HEADER_START@
5: *
6: * The contents of this file constitute Original Code as defined in and
7: * are subject to the Apple Public Source License Version 1.1 (the
8: * "License"). You may not use this file except in compliance with the
9: * License. Please obtain a copy of the License at
10: * http://www.apple.com/publicsource and read it before using this file.
11: *
12: * This Original Code and all software distributed under the License are
13: * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14: * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15: * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16: * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17: * License for the specific language governing rights and limitations
18: * under the License.
19: *
20: * @APPLE_LICENSE_HEADER_END@
21: */
22: /* ================================================================ */
23: /*
24: File: ConvertUTF.c
25: Author: Mark E. Davis
26: Copyright (C) 1994 Taligent, Inc. All rights reserved.
27:
28: This code is copyrighted. Under the copyright laws, this code may not
29: be copied, in whole or part, without prior written consent of Taligent.
30:
31: Taligent grants the right to use or reprint this code as long as this
32: ENTIRE copyright notice is reproduced in the code or reproduction.
33: The code is provided AS-IS, AND TALIGENT DISCLAIMS ALL WARRANTIES,
34: EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO IMPLIED
35: WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN
36: NO EVENT WILL TALIGENT BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING,
37: WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS
38: INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY
39: LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THIS CODE, EVEN
40: IF TALIGENT HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
41: BECAUSE SOME STATES DO NOT ALLOW THE EXCLUSION OR LIMITATION OF
42: LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE
43: LIMITATION MAY NOT APPLY TO YOU.
44:
45: RESTRICTED RIGHTS LEGEND: Use, duplication, or disclosure by the
46: government is subject to restrictions as set forth in subparagraph
47: (c)(l)(ii) of the Rights in Technical Data and Computer Software
48: clause at DFARS 252.227-7013 and FAR 52.227-19.
49:
50: This code may be protected by one or more U.S. and International
51: Patents.
52:
53: TRADEMARKS: Taligent and the Taligent Design Mark are registered
54: trademarks of Taligent, Inc.
55:
56: HISTORY:
57:
58: 22-Jan-1999 Don Brady Add decomposition to ConvertUTF8toUTF16.
59: 17-Nov-1998 Don Brady Add ":" to "/" conversions.
60: */
61: /* ================================================================ */
62:
63: #include "ConvertUTF.h"
64:
65: /* ================================================================ */
66:
67: const int halfShift = 10;
68: const UCS4 halfBase = 0x0010000UL;
69: const UCS4 halfMask = 0x3FFUL;
70: const UCS4 kSurrogateHighStart = 0xD800UL;
71: const UCS4 kSurrogateHighEnd = 0xDBFFUL;
72: const UCS4 kSurrogateLowStart = 0xDC00UL;
73: const UCS4 kSurrogateLowEnd = 0xDFFFUL;
74:
75: const UCS4 kReplacementCharacter = 0x0000FFFDUL;
76: const UCS4 kMaximumUCS2 = 0x0000FFFFUL;
77: const UCS4 kMaximumUTF16 = 0x0010FFFFUL;
78: const UCS4 kMaximumUCS4 = 0x7FFFFFFFUL;
79:
80: /* ================================================================ */
81:
82: UCS4 offsetsFromUTF8[6] = {0x00000000UL, 0x00003080UL, 0x000E2080UL,
83: 0x03C82080UL, 0xFA082080UL, 0x82082080UL};
/*
 * Indexed by the first byte of a UTF-8 sequence; yields the number of
 * bytes that follow it.  Bytes 0x80-0xBF (continuation bytes) map to 0,
 * the same as ASCII.
 */
char bytesFromUTF8[256] = {
	/* 0x00 - 0xBF */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0xC0 - 0xDF */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	/* 0xE0 - 0xEF */
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	/* 0xF0 - 0xF7, 0xF8 - 0xFB, 0xFC - 0xFF */
	3,3,3,3,3,3,3,3, 4,4,4,4, 5,5,5,5
};
93:
94: UTF8 firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
95:
96: static UTF16 Decomposer(register UTF16 srcChar, UTF16 *cmbChar);
97:
98: /* ================================================================ */
/* This code is similar in effect to making successive calls on the
 * mbtowc and wctomb routines in FSS-UTF. However, it is considerably
 * different in code:
 *   - it is adapted to be consistent with UTF16;
 *   - the interface converts a whole buffer to avoid function-call overhead;
 *   - constants have been gathered;
 *   - loops & conditionals have been removed as much as possible for
 *     efficiency, in favor of drop-through switch statements.
 */
108:
109: /*
110: * Colons vs. Slash
111: *
112: * The VFS layer uses a "/" as a pathname separator but HFS disks
113: * use a ":". So when converting from UTF-8, ":" characters need
114: * to be changed to "/" so that colons don't end up on HFS disks.
115: * Likewise when converting into UTF-8, "/" characters need to be
116: * changed to ":" so that a "/" in a filename is not returned
117: * through the VFS layer.
118: *
119: * We do not need to worry about full-width slash or colons since
120: * their respective representations outside of Unicode are never
121: * the 7-bit versions (0x2f or 0x3a).
122: */
123:
124:
125: /* ================================================================ */
126: ConversionResult ConvertUTF16toUTF8 (
127: UTF16** sourceStart, const UTF16* sourceEnd,
128: UTF8** targetStart, const UTF8* targetEnd)
129: {
130: ConversionResult result = ok;
131: register UTF16* source = *sourceStart;
132: register UTF8* target = *targetStart;
133: while (source < sourceEnd) {
134: register UCS4 ch;
135: register unsigned short bytesToWrite;
136: register const UCS4 byteMask = 0xBF;
137: register const UCS4 byteMark = 0x80;
138: register const UCS4 slash = '/';
139:
140: ch = *source++;
141:
142: /* optimize for ASCII case... */
143: if (ch < 0x80) {
144: if (ch == slash)
145: ch = ':'; /* VFS doesn't like slash */
146:
147: if (target >= targetEnd) {
148: result = targetExhausted;
149: break;
150: }
151: if (ch == 0) {
152: continue; /* skip over embedded NULLs */
153: }
154:
155: *target++ = ch;
156: continue;
157: } else if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd
158: && source < sourceEnd) {
159: register UCS4 ch2 = *source;
160: if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) {
161: ch = ((ch - kSurrogateHighStart) << halfShift)
162: + (ch2 - kSurrogateLowStart) + halfBase;
163: ++source;
164: };
165: };
166:
167: if (ch < 0x80) { bytesToWrite = 1;
168: } else if (ch < 0x800) { bytesToWrite = 2;
169: } else if (ch < 0x10000) { bytesToWrite = 3;
170: } else if (ch < 0x200000) { bytesToWrite = 4;
171: } else if (ch < 0x4000000) { bytesToWrite = 5;
172: } else if (ch <= kMaximumUCS4){ bytesToWrite = 6;
173: } else { bytesToWrite = 2;
174: ch = kReplacementCharacter;
175: }; /* I wish there were a smart way to avoid this conditional */
176:
177: target += bytesToWrite;
178: if (target > targetEnd) {
179: target -= bytesToWrite; result = targetExhausted; break;
180: };
181: switch (bytesToWrite) { /* note: code falls through cases! */
182: case 6: *--target = (ch | byteMark) & byteMask; ch >>= 6;
183: case 5: *--target = (ch | byteMark) & byteMask; ch >>= 6;
184: case 4: *--target = (ch | byteMark) & byteMask; ch >>= 6;
185: case 3: *--target = (ch | byteMark) & byteMask; ch >>= 6;
186: case 2: *--target = (ch | byteMark) & byteMask; ch >>= 6;
187: case 1: *--target = ch | firstByteMark[bytesToWrite];
188: };
189: target += bytesToWrite;
190: };
191: *sourceStart = source;
192: *targetStart = target;
193: return result;
194: };
195:
196: /* ================================================================ */
197:
198: ConversionResult ConvertUTF8toUTF16 (
199: UTF8** sourceStart, UTF8* sourceEnd,
200: UTF16** targetStart, const UTF16* targetEnd)
201: {
202: ConversionResult result = ok;
203: register UTF8* source = *sourceStart;
204: register UTF16* target = *targetStart;
205:
206: while (source < sourceEnd) {
207: UTF8 byte;
208: register UCS4 ch;
209: register unsigned short extraBytesToWrite;
210:
211: /* optimize for ASCII case...*/
212: byte = *source;
213: if (byte < 128) {
214: if (byte == ':')
215: byte = '/'; /* HFS doesn't like colons */
216: source++;
217: *target++ = byte;
218: continue;
219: }
220:
221: extraBytesToWrite = bytesFromUTF8[*source];
222: if (source + extraBytesToWrite > sourceEnd) {
223: result = sourceExhausted; break;
224: };
225: ch = 0;
226: switch(extraBytesToWrite) { /* note: code falls through cases! */
227: case 5: ch += *source++; ch <<= 6;
228: case 4: ch += *source++; ch <<= 6;
229: case 3: ch += *source++; ch <<= 6;
230: case 2: ch += *source++; ch <<= 6;
231: case 1: ch += *source++; ch <<= 6;
232: case 0: ch += *source++;
233: };
234: ch -= offsetsFromUTF8[extraBytesToWrite];
235:
236: if (target >= targetEnd) {
237: result = targetExhausted; break;
238: };
239: if (ch <= kMaximumUCS2) {
240: UTF16 combine;
241:
242: *target++ = Decomposer(ch, &combine);
243:
244: if (combine) {
245: if (target >= targetEnd) {
246: result = targetExhausted; break;
247: };
248: *target++ = combine;
249: }
250: } else if (ch > kMaximumUTF16) {
251: *target++ = kReplacementCharacter;
252: } else {
253: if (target + 1 >= targetEnd) {
254: result = targetExhausted; break;
255: };
256: ch -= halfBase;
257: *target++ = (ch >> halfShift) + kSurrogateHighStart;
258: *target++ = (ch & halfMask) + kSurrogateLowStart;
259: };
260: };
261: *sourceStart = source;
262: *targetStart = target;
263: return result;
264: };
265:
266: /*
267: * Lookup tables for Unicode chars 0x00C0 thru 0x00FF
268: * primary_char yields first decomposed char. If this
269: * char is an alpha char then get the combining char
270: * from the combining_char table and add 0x0300 to it.
271: */
272:
/*
 * Base character for each of U+00C0 .. U+00FF (index = code point - 0xC0).
 * Entries that are ASCII letters mark a decomposable character; any other
 * entry is the original code point, meaning "no decomposition".
 * Read-only, hence const.
 */
static const unsigned char primary_char[64] = {
	0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0xC6, 0x43,	/* U+00C0 */
	0x45, 0x45, 0x45, 0x45, 0x49, 0x49, 0x49, 0x49,	/* U+00C8 */
	0xD0, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x4F, 0xD7,	/* U+00D0 */
	0xD8, 0x55, 0x55, 0x55, 0x55, 0x59, 0xDE, 0xDF,	/* U+00D8 */
	0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0xE6, 0x63,	/* U+00E0 */
	0x65, 0x65, 0x65, 0x65, 0x69, 0x69, 0x69, 0x69,	/* U+00E8 */
	0xF0, 0x6E, 0x6F, 0x6F, 0x6F, 0x6F, 0x6F, 0xF7,	/* U+00F0 */
	0xF8, 0x75, 0x75, 0x75, 0x75, 0x79, 0xFE, 0x79,	/* U+00F8 */
};
290:
/*
 * Combining mark for each of U+00C0 .. U+00FF (index = code point - 0xC0),
 * stored as an offset from U+0300.  0xFF fills the slots of characters
 * that have no decomposition.  Read-only, hence const.
 */
static const unsigned char combining_char[64] = {
	0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,	/* U+00C0 */
	0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08,	/* U+00C8 */
	0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,	/* U+00D0 */
	0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0xFF,	/* U+00D8 */
	0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,	/* U+00E0 */
	0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08,	/* U+00E8 */
	0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,	/* U+00F0 */
	0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0x08	/* U+00F8 */
};
308:
309:
/*
 * One bit per code point in U+3000 .. U+30FF; a set bit marks a kana that
 * has a decomposition.  Each element carries 32 code points, most
 * significant bit first.
 */
static const unsigned long __CJKDecompBitmap[] = {
	0x00000000, 0x00000000, 0x000AAAAA, 0xA540DB6C,	/* 0x3000 */
	0x00000802, 0x000AAAAA, 0xA540DB6C, 0x000009E2,	/* 0x3080 */
};

/*
 * Non-zero when the bit for unicodeVal (an offset into the table's range)
 * is set.  The mask is built from 1UL because code points at bit position
 * 0 need a shift by 31, which is undefined on a signed int.
 * Note: unicodeVal is evaluated twice.
 */
#define IS_DECOMPOSABLE(table,unicodeVal) \
	((table)[(unicodeVal) / 32] & (1UL << (31 - ((unicodeVal) % 32))))
316:
317:
318: /*
319: * Decomposer
320: *
321: * Composed Unicode characters are forbidden on
322: * HFS Plus volumes. Decomposer will convert a
323: * composed character into its correct decomposed
324: * sequence.
325: *
326: * Currently only MacRoman and MacJapanese chars
327: * are handled. Other composed characters are
328: * passed unchanged.
329: */
330: static UTF16
331: Decomposer(register UTF16 srcChar, UTF16 *cmbChar)
332: {
333: UTF16 dstChar;
334:
335: *cmbChar = 0;
336:
337: if ((srcChar <= 0x00FF) && (srcChar >= 0x00C0)) {
338: srcChar -= 0x00C0;
339:
340: dstChar = (UTF16) primary_char[srcChar];
341:
342: if (dstChar <= 'z') {
343: *cmbChar = (UTF16) 0x0300 + (UTF16) combining_char[srcChar];
344: }
345: } else if ((srcChar > 0x3000) && (srcChar < 0x3100) &&
346: IS_DECOMPOSABLE(__CJKDecompBitmap, srcChar - 0x3000)) {
347: switch(srcChar) {
348: case 0x3071: dstChar = 0x306F; *cmbChar = 0x309A; break; // HIRAGANA LETTER PA
349: case 0x3074: dstChar = 0x3072; *cmbChar = 0x309A; break; // HIRAGANA LETTER PI
350: case 0x3077: dstChar = 0x3075; *cmbChar = 0x309A; break; // HIRAGANA LETTER PU
351: case 0x307A: dstChar = 0x3078; *cmbChar = 0x309A; break; // HIRAGANA LETTER PE
352:
353: case 0x307D: dstChar = 0x307B; *cmbChar = 0x309A; break; // HIRAGANA LETTER PO
354: case 0x3094: dstChar = 0x3046; *cmbChar = 0x3099; break; // HIRAGANA LETTER VU
355: case 0x30D1: dstChar = 0x30CF; *cmbChar = 0x309A; break; // KATAKANA LETTER PA
356: case 0x30D4: dstChar = 0x30D2; *cmbChar = 0x309A; break; // KATAKANA LETTER PI
357:
358: case 0x30D7: dstChar = 0x30D5; *cmbChar = 0x309A; break; // KATAKANA LETTER PU
359: case 0x30DA: dstChar = 0x30D8; *cmbChar = 0x309A; break; // KATAKANA LETTER PE
360: case 0x30DD: dstChar = 0x30DB; *cmbChar = 0x309A; break; // KATAKANA LETTER PO
361: case 0x30F4: dstChar = 0x30A6; *cmbChar = 0x3099; break; // KATAKANA LETTER VU
362:
363: case 0x30F7: dstChar = 0x30EF; *cmbChar = 0x3099; break; // KATAKANA LETTER VA
364: case 0x30F8: dstChar = 0x30F0; *cmbChar = 0x3099; break; // KATAKANA LETTER VI
365: case 0x30F9: dstChar = 0x30F1; *cmbChar = 0x3099; break; // KATAKANA LETTER VE
366: case 0x30FA: dstChar = 0x30F2; *cmbChar = 0x3099; break; // KATAKANA LETTER VO
367:
368: default:
369: /* the rest (41 of them) have a simple conversion */
370: dstChar = srcChar - 1;
371: *cmbChar = 0x3099;
372: };
373: } else {
374: dstChar = srcChar;
375: }
376:
377: return dstChar;
378: }
379:
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.