|
|
1.1 ! root 1: /* ! 2: * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. ! 3: * ! 4: * @APPLE_LICENSE_HEADER_START@ ! 5: * ! 6: * The contents of this file constitute Original Code as defined in and ! 7: * are subject to the Apple Public Source License Version 1.1 (the ! 8: * "License"). You may not use this file except in compliance with the ! 9: * License. Please obtain a copy of the License at ! 10: * http://www.apple.com/publicsource and read it before using this file. ! 11: * ! 12: * This Original Code and all software distributed under the License are ! 13: * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER ! 14: * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, ! 15: * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, ! 16: * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the ! 17: * License for the specific language governing rights and limitations ! 18: * under the License. ! 19: * ! 20: * @APPLE_LICENSE_HEADER_END@ ! 21: */ ! 22: /* ================================================================ */ ! 23: /* ! 24: File: ConvertUTF.c ! 25: Author: Mark E. Davis ! 26: Copyright (C) 1994 Taligent, Inc. All rights reserved. ! 27: ! 28: This code is copyrighted. Under the copyright laws, this code may not ! 29: be copied, in whole or part, without prior written consent of Taligent. ! 30: ! 31: Taligent grants the right to use or reprint this code as long as this ! 32: ENTIRE copyright notice is reproduced in the code or reproduction. ! 33: The code is provided AS-IS, AND TALIGENT DISCLAIMS ALL WARRANTIES, ! 34: EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO IMPLIED ! 35: WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN ! 36: NO EVENT WILL TALIGENT BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING, ! 37: WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS ! 38: INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY ! 39: LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THIS CODE, EVEN ! 40: IF TALIGENT HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. ! 41: BECAUSE SOME STATES DO NOT ALLOW THE EXCLUSION OR LIMITATION OF ! 42: LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE ! 43: LIMITATION MAY NOT APPLY TO YOU. ! 44: ! 45: RESTRICTED RIGHTS LEGEND: Use, duplication, or disclosure by the ! 46: government is subject to restrictions as set forth in subparagraph ! 47: (c)(l)(ii) of the Rights in Technical Data and Computer Software ! 48: clause at DFARS 252.227-7013 and FAR 52.227-19. ! 49: ! 50: This code may be protected by one or more U.S. and International ! 51: Patents. ! 52: ! 53: TRADEMARKS: Taligent and the Taligent Design Mark are registered ! 54: trademarks of Taligent, Inc. ! 55: ! 56: HISTORY: ! 57: ! 58: 22-Jan-1999 Don Brady Add decomposition to ConvertUTF8toUTF16. ! 59: 17-Nov-1998 Don Brady Add ":" to "/" conversions. ! 60: */ ! 61: /* ================================================================ */ ! 62: ! 63: #include "ConvertUTF.h" ! 64: ! 65: /* ================================================================ */ ! 66: ! 67: const int halfShift = 10; ! 68: const UCS4 halfBase = 0x0010000UL; ! 69: const UCS4 halfMask = 0x3FFUL; ! 70: const UCS4 kSurrogateHighStart = 0xD800UL; ! 71: const UCS4 kSurrogateHighEnd = 0xDBFFUL; ! 72: const UCS4 kSurrogateLowStart = 0xDC00UL; ! 73: const UCS4 kSurrogateLowEnd = 0xDFFFUL; ! 74: ! 75: const UCS4 kReplacementCharacter = 0x0000FFFDUL; ! 76: const UCS4 kMaximumUCS2 = 0x0000FFFFUL; ! 77: const UCS4 kMaximumUTF16 = 0x0010FFFFUL; ! 78: const UCS4 kMaximumUCS4 = 0x7FFFFFFFUL; ! 79: ! 80: /* ================================================================ */ ! 81: ! 82: UCS4 offsetsFromUTF8[6] = {0x00000000UL, 0x00003080UL, 0x000E2080UL, ! 83: 0x03C82080UL, 0xFA082080UL, 0x82082080UL}; ! 84: char bytesFromUTF8[256] = { ! 85: 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, ! 86: 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, ! 87: 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, ! 88: 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, ! 89: 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, ! 90: 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, ! 91: 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, ! 92: 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5}; ! 93: ! 94: UTF8 firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC}; ! 95: ! 96: static UTF16 Decomposer(register UTF16 srcChar, UTF16 *cmbChar); ! 97: ! 98: /* ================================================================ */ ! 99: /* This code is similar in effect to making successive calls on the ! 100: * mbtowc and wctomb routines in FSS-UTF. However, it is considerably ! 101: * different in code: ! 102: * it is adapted to be consistent with UTF16, ! 103: * the interface converts a whole buffer to avoid function-call overhead ! 104: * constants have been gathered. ! 105: * loops & conditionals have been removed as much as possible for ! 106: * efficiency, in favor of drop-through switch statements. ! 107: */ ! 108: ! 109: /* ! 110: * Colons vs. Slash ! 111: * ! 112: * The VFS layer uses a "/" as a pathname separator but HFS disks ! 113: * use a ":". So when converting from UTF-8, ":" characters need ! 114: * to be changed to "/" so that colons don't end up on HFS disks. ! 115: * Likewise when converting into UTF-8, "/" characters need to be ! 116: * changed to ":" so that a "/" in a filename is not returned ! 117: * through the VFS layer. ! 118: * ! 119: * We do not need to worry about full-width slash or colons since ! 120: * their respective representations outside of Unicode are never ! 121: * the 7-bit versions (0x2f or 0x3a). ! 122: */ ! 123: ! 124: ! 125: /* ================================================================ */ ! 126: ConversionResult ConvertUTF16toUTF8 ( ! 127: UTF16** sourceStart, const UTF16* sourceEnd, ! 128: UTF8** targetStart, const UTF8* targetEnd) ! 129: { ! 130: ConversionResult result = ok; ! 131: register UTF16* source = *sourceStart; ! 132: register UTF8* target = *targetStart; ! 133: while (source < sourceEnd) { ! 134: register UCS4 ch; ! 135: register unsigned short bytesToWrite; ! 136: register const UCS4 byteMask = 0xBF; ! 137: register const UCS4 byteMark = 0x80; ! 138: register const UCS4 slash = '/'; ! 139: ! 140: ch = *source++; ! 141: ! 142: /* optimize for ASCII case... */ ! 143: if (ch < 0x80) { ! 144: if (ch == slash) ! 145: ch = ':'; /* VFS doesn't like slash */ ! 146: ! 147: if (target >= targetEnd) { ! 148: result = targetExhausted; ! 149: break; ! 150: } ! 151: if (ch == 0) { ! 152: continue; /* skip over embedded NULLs */ ! 153: } ! 154: ! 155: *target++ = ch; ! 156: continue; ! 157: } else if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd ! 158: && source < sourceEnd) { ! 159: register UCS4 ch2 = *source; ! 160: if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) { ! 161: ch = ((ch - kSurrogateHighStart) << halfShift) ! 162: + (ch2 - kSurrogateLowStart) + halfBase; ! 163: ++source; ! 164: }; ! 165: }; ! 166: ! 167: if (ch < 0x80) { bytesToWrite = 1; ! 168: } else if (ch < 0x800) { bytesToWrite = 2; ! 169: } else if (ch < 0x10000) { bytesToWrite = 3; ! 170: } else if (ch < 0x200000) { bytesToWrite = 4; ! 171: } else if (ch < 0x4000000) { bytesToWrite = 5; ! 172: } else if (ch <= kMaximumUCS4){ bytesToWrite = 6; ! 173: } else { bytesToWrite = 2; ! 174: ch = kReplacementCharacter; ! 175: }; /* I wish there were a smart way to avoid this conditional */ ! 176: ! 177: target += bytesToWrite; ! 178: if (target > targetEnd) { ! 179: target -= bytesToWrite; result = targetExhausted; break; ! 180: }; ! 181: switch (bytesToWrite) { /* note: code falls through cases! */ ! 182: case 6: *--target = (ch | byteMark) & byteMask; ch >>= 6; ! 183: case 5: *--target = (ch | byteMark) & byteMask; ch >>= 6; ! 184: case 4: *--target = (ch | byteMark) & byteMask; ch >>= 6; ! 185: case 3: *--target = (ch | byteMark) & byteMask; ch >>= 6; ! 186: case 2: *--target = (ch | byteMark) & byteMask; ch >>= 6; ! 187: case 1: *--target = ch | firstByteMark[bytesToWrite]; ! 188: }; ! 189: target += bytesToWrite; ! 190: }; ! 191: *sourceStart = source; ! 192: *targetStart = target; ! 193: return result; ! 194: }; ! 195: ! 196: /* ================================================================ */ ! 197: ! 198: ConversionResult ConvertUTF8toUTF16 ( ! 199: UTF8** sourceStart, UTF8* sourceEnd, ! 200: UTF16** targetStart, const UTF16* targetEnd) ! 201: { ! 202: ConversionResult result = ok; ! 203: register UTF8* source = *sourceStart; ! 204: register UTF16* target = *targetStart; ! 205: ! 206: while (source < sourceEnd) { ! 207: UTF8 byte; ! 208: register UCS4 ch; ! 209: register unsigned short extraBytesToWrite; ! 210: ! 211: /* optimize for ASCII case...*/ ! 212: byte = *source; ! 213: if (byte < 128) { ! 214: if (byte == ':') ! 215: byte = '/'; /* HFS doesn't like colons */ ! 216: source++; ! 217: *target++ = byte; ! 218: continue; ! 219: } ! 220: ! 221: extraBytesToWrite = bytesFromUTF8[*source]; ! 222: if (source + extraBytesToWrite > sourceEnd) { ! 223: result = sourceExhausted; break; ! 224: }; ! 225: ch = 0; ! 226: switch(extraBytesToWrite) { /* note: code falls through cases! */ ! 227: case 5: ch += *source++; ch <<= 6; ! 228: case 4: ch += *source++; ch <<= 6; ! 229: case 3: ch += *source++; ch <<= 6; ! 230: case 2: ch += *source++; ch <<= 6; ! 231: case 1: ch += *source++; ch <<= 6; ! 232: case 0: ch += *source++; ! 233: }; ! 234: ch -= offsetsFromUTF8[extraBytesToWrite]; ! 235: ! 236: if (target >= targetEnd) { ! 237: result = targetExhausted; break; ! 238: }; ! 239: if (ch <= kMaximumUCS2) { ! 240: UTF16 combine; ! 241: ! 242: *target++ = Decomposer(ch, &combine); ! 243: ! 244: if (combine) { ! 245: if (target >= targetEnd) { ! 246: result = targetExhausted; break; ! 247: }; ! 248: *target++ = combine; ! 249: } ! 250: } else if (ch > kMaximumUTF16) { ! 251: *target++ = kReplacementCharacter; ! 252: } else { ! 253: if (target + 1 >= targetEnd) { ! 254: result = targetExhausted; break; ! 255: }; ! 256: ch -= halfBase; ! 257: *target++ = (ch >> halfShift) + kSurrogateHighStart; ! 258: *target++ = (ch & halfMask) + kSurrogateLowStart; ! 259: }; ! 260: }; ! 261: *sourceStart = source; ! 262: *targetStart = target; ! 263: return result; ! 264: }; ! 265: ! 266: /* ! 267: * Lookup tables for Unicode chars 0x00C0 thru 0x00FF ! 268: * primary_char yields first decomposed char. If this ! 269: * char is an alpha char then get the combining char ! 270: * from the combining_char table and add 0x0300 to it. ! 271: */ ! 272: ! 273: static unsigned char primary_char[64] = { ! 274: 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0xC6, 0x43, ! 275: ! 276: 0x45, 0x45, 0x45, 0x45, 0x49, 0x49, 0x49, 0x49, ! 277: ! 278: 0xD0, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x4F, 0xD7, ! 279: ! 280: 0xD8, 0x55, 0x55, 0x55, 0x55, 0x59, 0xDE, 0xDF, ! 281: ! 282: 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0xE6, 0x63, ! 283: ! 284: 0x65, 0x65, 0x65, 0x65, 0x69, 0x69, 0x69, 0x69, ! 285: ! 286: 0xF0, 0x6E, 0x6F, 0x6F, 0x6F, 0x6F, 0x6F, 0xF7, ! 287: ! 288: 0xF8, 0x75, 0x75, 0x75, 0x75, 0x79, 0xFE, 0x79, ! 289: }; ! 290: ! 291: static unsigned char combining_char[64] = { ! 292: 0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27, ! 293: ! 294: 0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08, ! 295: ! 296: 0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF, ! 297: ! 298: 0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0xFF, ! 299: ! 300: 0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27, ! 301: ! 302: 0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08, ! 303: ! 304: 0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF, ! 305: ! 306: 0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0x08 ! 307: }; ! 308: ! 309: ! 310: static const unsigned long __CJKDecompBitmap[] = { // 0x3000 ~ 0x30FF ! 311: 0x00000000, 0x00000000, 0x000AAAAA, 0xA540DB6C, // 0x3000 ! 312: 0x00000802, 0x000AAAAA, 0xA540DB6C, 0x000009E2, // 0x3080 ! 313: }; ! 314: ! 315: #define IS_DECOMPOSABLE(table,unicodeVal) (table[(unicodeVal) / 32] & (1 << (31 - ((unicodeVal) % 32)))) ! 316: ! 317: ! 318: /* ! 319: * Decomposer ! 320: * ! 321: * Composed Unicode characters are forbidden on ! 322: * HFS Plus volumes. Decomposer will convert a ! 323: * composed character into its correct decomposed ! 324: * sequence. ! 325: * ! 326: * Currently only MacRoman and MacJapanese chars ! 327: * are handled. Other composed characters are ! 328: * passed unchanged. ! 329: */ ! 330: static UTF16 ! 331: Decomposer(register UTF16 srcChar, UTF16 *cmbChar) ! 332: { ! 333: UTF16 dstChar; ! 334: ! 335: *cmbChar = 0; ! 336: ! 337: if ((srcChar <= 0x00FF) && (srcChar >= 0x00C0)) { ! 338: srcChar -= 0x00C0; ! 339: ! 340: dstChar = (UTF16) primary_char[srcChar]; ! 341: ! 342: if (dstChar <= 'z') { ! 343: *cmbChar = (UTF16) 0x0300 + (UTF16) combining_char[srcChar]; ! 344: } ! 345: } else if ((srcChar > 0x3000) && (srcChar < 0x3100) && ! 346: IS_DECOMPOSABLE(__CJKDecompBitmap, srcChar - 0x3000)) { ! 347: switch(srcChar) { ! 348: case 0x3071: dstChar = 0x306F; *cmbChar = 0x309A; break; // HIRAGANA LETTER PA ! 349: case 0x3074: dstChar = 0x3072; *cmbChar = 0x309A; break; // HIRAGANA LETTER PI ! 350: case 0x3077: dstChar = 0x3075; *cmbChar = 0x309A; break; // HIRAGANA LETTER PU ! 351: case 0x307A: dstChar = 0x3078; *cmbChar = 0x309A; break; // HIRAGANA LETTER PE ! 352: ! 353: case 0x307D: dstChar = 0x307B; *cmbChar = 0x309A; break; // HIRAGANA LETTER PO ! 354: case 0x3094: dstChar = 0x3046; *cmbChar = 0x3099; break; // HIRAGANA LETTER VU ! 355: case 0x30D1: dstChar = 0x30CF; *cmbChar = 0x309A; break; // KATAKANA LETTER PA ! 356: case 0x30D4: dstChar = 0x30D2; *cmbChar = 0x309A; break; // KATAKANA LETTER PI ! 357: ! 358: case 0x30D7: dstChar = 0x30D5; *cmbChar = 0x309A; break; // KATAKANA LETTER PU ! 359: case 0x30DA: dstChar = 0x30D8; *cmbChar = 0x309A; break; // KATAKANA LETTER PE ! 360: case 0x30DD: dstChar = 0x30DB; *cmbChar = 0x309A; break; // KATAKANA LETTER PO ! 361: case 0x30F4: dstChar = 0x30A6; *cmbChar = 0x3099; break; // KATAKANA LETTER VU ! 362: ! 363: case 0x30F7: dstChar = 0x30EF; *cmbChar = 0x3099; break; // KATAKANA LETTER VA ! 364: case 0x30F8: dstChar = 0x30F0; *cmbChar = 0x3099; break; // KATAKANA LETTER VI ! 365: case 0x30F9: dstChar = 0x30F1; *cmbChar = 0x3099; break; // KATAKANA LETTER VE ! 366: case 0x30FA: dstChar = 0x30F2; *cmbChar = 0x3099; break; // KATAKANA LETTER VO ! 367: ! 368: default: ! 369: /* the rest (41 of them) have a simple conversion */ ! 370: dstChar = srcChar - 1; ! 371: *cmbChar = 0x3099; ! 372: }; ! 373: } else { ! 374: dstChar = srcChar; ! 375: } ! 376: ! 377: return dstChar; ! 378: } ! 379:
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.