XNU/bsd/hfs/hfscommon/Unicode/ConvertUTF.c - annotate

Return to ConvertUTF.c CVS log
Up to [Apple XNU] / XNU / bsd / hfs / hfscommon / Unicode
Annotation of XNU/bsd/hfs/hfscommon/Unicode/ConvertUTF.c, revision 1.1.1.1

1.1       root        1: /*
                      2:  * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
                      3:  *
                      4:  * @APPLE_LICENSE_HEADER_START@
                      5:  * 
                      6:  * The contents of this file constitute Original Code as defined in and
                      7:  * are subject to the Apple Public Source License Version 1.1 (the
                      8:  * "License").  You may not use this file except in compliance with the
                      9:  * License.  Please obtain a copy of the License at
                     10:  * http://www.apple.com/publicsource and read it before using this file.
                     11:  * 
                     12:  * This Original Code and all software distributed under the License are
                     13:  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
                     14:  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
                     15:  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
                     16:  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
                     17:  * License for the specific language governing rights and limitations
                     18:  * under the License.
                     19:  * 
                     20:  * @APPLE_LICENSE_HEADER_END@
                     21:  */
                     22: /* ================================================================ */
                     23: /*
                     24: File:  ConvertUTF.c
                     25: Author: Mark E. Davis
                     26: Copyright (C) 1994 Taligent, Inc. All rights reserved.
                     27: 
                     28: This code is copyrighted. Under the copyright laws, this code may not
                     29: be copied, in whole or part, without prior written consent of Taligent. 
                     30: 
                     31: Taligent grants the right to use or reprint this code as long as this
                     32: ENTIRE copyright notice is reproduced in the code or reproduction.
                     33: The code is provided AS-IS, AND TALIGENT DISCLAIMS ALL WARRANTIES,
                     34: EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO IMPLIED
                     35: WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  IN
                     36: NO EVENT WILL TALIGENT BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING,
                     37: WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS
                     38: INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY
                     39: LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THIS CODE, EVEN
                     40: IF TALIGENT HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
                     41: BECAUSE SOME STATES DO NOT ALLOW THE EXCLUSION OR LIMITATION OF
                     42: LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE
                     43: LIMITATION MAY NOT APPLY TO YOU.
                     44: 
                     45: RESTRICTED RIGHTS LEGEND: Use, duplication, or disclosure by the
                     46: government is subject to restrictions as set forth in subparagraph
                     47: (c)(l)(ii) of the Rights in Technical Data and Computer Software
                     48: clause at DFARS 252.227-7013 and FAR 52.227-19.
                     49: 
                     50: This code may be protected by one or more U.S. and International
                     51: Patents.
                     52: 
                     53: TRADEMARKS: Taligent and the Taligent Design Mark are registered
                     54: trademarks of Taligent, Inc.
                     55: 
                     56: HISTORY:
                     57: 
                     58:        22-Jan-1999     Don Brady               Add decomposition to ConvertUTF8toUTF16.
                     59:        17-Nov-1998     Don Brady               Add ":" to "/" conversions.
                     60: */
                     61: /* ================================================================ */
                     62: 
                     63: #include "ConvertUTF.h"
                     64: 
                     65: /* ================================================================ */
                     66: 
                     67: const int halfShift                            = 10;
                     68: const UCS4 halfBase                            = 0x0010000UL;
                     69: const UCS4 halfMask                            = 0x3FFUL;
                     70: const UCS4 kSurrogateHighStart = 0xD800UL;
                     71: const UCS4 kSurrogateHighEnd   = 0xDBFFUL;
                     72: const UCS4 kSurrogateLowStart  = 0xDC00UL;
                     73: const UCS4 kSurrogateLowEnd            = 0xDFFFUL;
                     74: 
                     75: const UCS4 kReplacementCharacter =     0x0000FFFDUL;
                     76: const UCS4 kMaximumUCS2 =                      0x0000FFFFUL;
                     77: const UCS4 kMaximumUTF16 =                     0x0010FFFFUL;
                     78: const UCS4 kMaximumUCS4 =                      0x7FFFFFFFUL;
                     79: 
                     80: /* ================================================================ */
                     81: 
                     82: UCS4 offsetsFromUTF8[6] =      {0x00000000UL, 0x00003080UL, 0x000E2080UL, 
                     83:                                                         0x03C82080UL, 0xFA082080UL, 0x82082080UL};
                     84: char bytesFromUTF8[256] = {
                     85:        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                     86:        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                     87:        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                     88:        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                     89:        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                     90:        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                     91:        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
                     92:        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5};
                     93: 
                     94: UTF8 firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
                     95: 
                     96: static UTF16 Decomposer(register UTF16 srcChar, UTF16 *cmbChar);
                     97: 
                     98: /* ================================================================ */
/*	This code is similar in effect to making successive calls on the
* mbtowc and wctomb routines in FSS-UTF. However, it differs
* considerably in implementation:
*  - it is adapted to be consistent with UTF-16;
*  - the interface converts a whole buffer to avoid function-call overhead;
*  - constants have been gathered;
*  - loops and conditionals have been removed as much as possible for
*    efficiency, in favor of fall-through switch statements.
*/
                    108: 
                    109: /*
                    110:  * Colons vs. Slash
                    111:  *
                    112:  * The VFS layer uses a "/" as a pathname separator but HFS disks
                    113:  * use a ":".  So when converting from UTF-8, ":" characters need
                    114:  * to be changed to "/" so that colons don't end up on HFS disks.
                    115:  * Likewise when converting into UTF-8, "/" characters need to be
                    116:  * changed to ":" so that a "/" in a filename is not returned 
                    117:  * through the VFS layer.
                    118:  *
                    119:  * We do not need to worry about full-width slash or colons since
                    120:  * their respective representations outside of Unicode are never
                    121:  * the 7-bit versions (0x2f or 0x3a).
                    122:  */
                    123: 
                    124: 
                    125: /* ================================================================ */
                    126: ConversionResult       ConvertUTF16toUTF8 (
                    127:                UTF16** sourceStart, const UTF16* sourceEnd, 
                    128:                UTF8** targetStart, const UTF8* targetEnd)
                    129: {
                    130:        ConversionResult result = ok;
                    131:        register UTF16* source = *sourceStart;
                    132:        register UTF8* target = *targetStart;
                    133:        while (source < sourceEnd) {
                    134:                register UCS4 ch;
                    135:                register unsigned short bytesToWrite;
                    136:                register const UCS4 byteMask = 0xBF;
                    137:                register const UCS4 byteMark = 0x80; 
                    138:                register const UCS4 slash = '/'; 
                    139: 
                    140:                ch = *source++;
                    141: 
                    142:                /* optimize for ASCII case... */
                    143:                if (ch < 0x80) {
                    144:                        if (ch == slash)
                    145:                        ch = ':';       /* VFS doesn't like slash */
                    146: 
                    147:                        if (target >= targetEnd) {
                    148:                                result = targetExhausted;
                    149:                                break;
                    150:                        }
                    151:                        if (ch == 0) {
                    152:                                continue;       /* skip over embedded NULLs */
                    153:                        }
                    154: 
                    155:                        *target++ = ch;
                    156:                        continue;
                    157:                } else if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd
                    158:                                && source < sourceEnd) {
                    159:                        register UCS4 ch2 = *source;
                    160:                        if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) {
                    161:                                ch = ((ch - kSurrogateHighStart) << halfShift)
                    162:                                        + (ch2 - kSurrogateLowStart) + halfBase;
                    163:                                ++source;
                    164:                        };
                    165:                };
                    166: 
                    167:                if (ch < 0x80) {                                bytesToWrite = 1;
                    168:                } else if (ch < 0x800) {                bytesToWrite = 2;
                    169:                } else if (ch < 0x10000) {              bytesToWrite = 3;
                    170:                } else if (ch < 0x200000) {             bytesToWrite = 4;
                    171:                } else if (ch < 0x4000000) {    bytesToWrite = 5;
                    172:                } else if (ch <= kMaximumUCS4){ bytesToWrite = 6;
                    173:                } else {                                                bytesToWrite = 2;
                    174:                                                                                ch = kReplacementCharacter;
                    175:                }; /* I wish there were a smart way to avoid this conditional */
                    176:                
                    177:                target += bytesToWrite;
                    178:                if (target > targetEnd) {
                    179:                        target -= bytesToWrite; result = targetExhausted; break;
                    180:                };
                    181:                switch (bytesToWrite) { /* note: code falls through cases! */
                    182:                        case 6: *--target = (ch | byteMark) & byteMask; ch >>= 6;
                    183:                        case 5: *--target = (ch | byteMark) & byteMask; ch >>= 6;
                    184:                        case 4: *--target = (ch | byteMark) & byteMask; ch >>= 6;
                    185:                        case 3: *--target = (ch | byteMark) & byteMask; ch >>= 6;
                    186:                        case 2: *--target = (ch | byteMark) & byteMask; ch >>= 6;
                    187:                        case 1: *--target =  ch | firstByteMark[bytesToWrite];
                    188:                };
                    189:                target += bytesToWrite;
                    190:        };
                    191:        *sourceStart = source;
                    192:        *targetStart = target;
                    193:        return result;
                    194: };
                    195: 
                    196: /* ================================================================ */
                    197: 
                    198: ConversionResult       ConvertUTF8toUTF16 (
                    199:                UTF8** sourceStart, UTF8* sourceEnd, 
                    200:                UTF16** targetStart, const UTF16* targetEnd)
                    201: {
                    202:        ConversionResult result = ok;
                    203:        register UTF8* source = *sourceStart;
                    204:        register UTF16* target = *targetStart;
                    205: 
                    206:        while (source < sourceEnd) {
                    207:                UTF8 byte;
                    208:                register UCS4 ch;
                    209:                register unsigned short extraBytesToWrite;
                    210: 
                    211:                /* optimize for ASCII case...*/
                    212:                byte = *source;
                    213:                if (byte < 128) {
                    214:                        if (byte == ':')
                    215:                                byte = '/';     /* HFS doesn't like colons */
                    216:                        source++;
                    217:                        *target++ = byte;
                    218:                        continue;
                    219:                }
                    220: 
                    221:                extraBytesToWrite = bytesFromUTF8[*source];
                    222:                if (source + extraBytesToWrite > sourceEnd) {
                    223:                        result = sourceExhausted; break;
                    224:                };
                    225:                ch = 0;
                    226:                switch(extraBytesToWrite) {     /* note: code falls through cases! */
                    227:                        case 5: ch += *source++; ch <<= 6;
                    228:                        case 4: ch += *source++; ch <<= 6;
                    229:                        case 3: ch += *source++; ch <<= 6;
                    230:                        case 2: ch += *source++; ch <<= 6;
                    231:                        case 1: ch += *source++; ch <<= 6;
                    232:                        case 0: ch += *source++;
                    233:                };
                    234:                ch -= offsetsFromUTF8[extraBytesToWrite];
                    235: 
                    236:                if (target >= targetEnd) {
                    237:                        result = targetExhausted; break;
                    238:                };
                    239:                if (ch <= kMaximumUCS2) {
                    240:                        UTF16 combine;
                    241: 
                    242:                        *target++ = Decomposer(ch, &combine);
                    243: 
                    244:                        if (combine) {
                    245:                                if (target >= targetEnd) {
                    246:                                        result = targetExhausted; break;
                    247:                                };
                    248:                                *target++ = combine;
                    249:                        }
                    250:                } else if (ch > kMaximumUTF16) {
                    251:                        *target++ = kReplacementCharacter;
                    252:                } else {
                    253:                        if (target + 1 >= targetEnd) {
                    254:                                result = targetExhausted; break;
                    255:                        };
                    256:                        ch -= halfBase;
                    257:                        *target++ = (ch >> halfShift) + kSurrogateHighStart;
                    258:                        *target++ = (ch & halfMask) + kSurrogateLowStart;
                    259:                };
                    260:        };
                    261:        *sourceStart = source;
                    262:        *targetStart = target;
                    263:        return result;
                    264: };
                    265: 
                    266: /*
                    267:  * Lookup tables for Unicode chars 0x00C0 thru 0x00FF
                    268:  * primary_char yields first decomposed char. If this
                    269:  * char is an alpha char then get the combining char
                    270:  * from the combining_char table and add 0x0300 to it.
                    271:  */
                    272: 
                    273: static unsigned char primary_char[64] = {
                    274:        0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0xC6, 0x43,
                    275: 
                    276:        0x45, 0x45, 0x45, 0x45, 0x49, 0x49, 0x49, 0x49,
                    277: 
                    278:        0xD0, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x4F, 0xD7,
                    279: 
                    280:        0xD8, 0x55, 0x55, 0x55, 0x55, 0x59, 0xDE, 0xDF,
                    281: 
                    282:        0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0xE6, 0x63,
                    283: 
                    284:        0x65, 0x65, 0x65, 0x65, 0x69, 0x69, 0x69, 0x69,
                    285: 
                    286:        0xF0, 0x6E, 0x6F, 0x6F, 0x6F, 0x6F, 0x6F, 0xF7,
                    287: 
                    288:        0xF8, 0x75, 0x75, 0x75, 0x75, 0x79, 0xFE, 0x79,
                    289: };
                    290: 
                    291: static unsigned char combining_char[64] = {
                    292:        0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,
                    293: 
                    294:        0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08,
                    295: 
                    296:        0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,
                    297: 
                    298:        0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0xFF,
                    299: 
                    300:        0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,
                    301: 
                    302:        0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08,
                    303: 
                    304:        0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,
                    305: 
                    306:        0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0x08
                    307: };
                    308: 
                    309: 
                    310: static const unsigned long __CJKDecompBitmap[] = {     // 0x3000 ~ 0x30FF
                    311:     0x00000000, 0x00000000, 0x000AAAAA, 0xA540DB6C,    // 0x3000
                    312:     0x00000802, 0x000AAAAA, 0xA540DB6C, 0x000009E2,    // 0x3080
                    313: };
                    314: 
                    315: #define IS_DECOMPOSABLE(table,unicodeVal) (table[(unicodeVal) / 32] & (1 << (31 - ((unicodeVal) % 32))))
                    316: 
                    317: 
                    318: /*
                    319:  * Decomposer
                    320:  *
                    321:  * Composed Unicode characters are forbidden on
                    322:  * HFS Plus volumes. Decomposer will convert a
                    323:  * composed character into its correct decomposed
                    324:  * sequence.
                    325:  *
                    326:  * Currently only MacRoman and MacJapanese chars
                    327:  * are handled.  Other composed characters are
                    328:  * passed unchanged.
                    329:  */
                    330: static UTF16
                    331: Decomposer(register UTF16 srcChar, UTF16 *cmbChar)
                    332: {
                    333:        UTF16 dstChar;
                    334:        
                    335:        *cmbChar = 0;
                    336: 
                    337:        if ((srcChar <= 0x00FF) && (srcChar >= 0x00C0)) {
                    338:                srcChar -= 0x00C0;
                    339:                
                    340:                dstChar = (UTF16) primary_char[srcChar];
                    341: 
                    342:                if (dstChar <= 'z') {
                    343:                        *cmbChar = (UTF16) 0x0300 + (UTF16) combining_char[srcChar];
                    344:                }
                    345:        } else if ((srcChar > 0x3000) && (srcChar < 0x3100) &&
                    346:                                IS_DECOMPOSABLE(__CJKDecompBitmap, srcChar - 0x3000)) {
                    347:                switch(srcChar) {
                    348:                case 0x3071: dstChar = 0x306F; *cmbChar = 0x309A; break;        // HIRAGANA LETTER PA
                    349:                case 0x3074: dstChar = 0x3072; *cmbChar = 0x309A; break;        // HIRAGANA LETTER PI
                    350:                case 0x3077: dstChar = 0x3075; *cmbChar = 0x309A; break;        // HIRAGANA LETTER PU
                    351:                case 0x307A: dstChar = 0x3078; *cmbChar = 0x309A; break;        // HIRAGANA LETTER PE
                    352: 
                    353:                case 0x307D: dstChar = 0x307B; *cmbChar = 0x309A; break;        // HIRAGANA LETTER PO
                    354:                case 0x3094: dstChar = 0x3046; *cmbChar = 0x3099; break;        // HIRAGANA LETTER VU
                    355:                case 0x30D1: dstChar = 0x30CF; *cmbChar = 0x309A; break;        // KATAKANA LETTER PA
                    356:                case 0x30D4: dstChar = 0x30D2; *cmbChar = 0x309A; break;        // KATAKANA LETTER PI
                    357: 
                    358:                case 0x30D7: dstChar = 0x30D5; *cmbChar = 0x309A; break;        // KATAKANA LETTER PU
                    359:                case 0x30DA: dstChar = 0x30D8; *cmbChar = 0x309A; break;        // KATAKANA LETTER PE
                    360:                case 0x30DD: dstChar = 0x30DB; *cmbChar = 0x309A; break;        // KATAKANA LETTER PO
                    361:                case 0x30F4: dstChar = 0x30A6; *cmbChar = 0x3099; break;        // KATAKANA LETTER VU
                    362: 
                    363:                case 0x30F7: dstChar = 0x30EF; *cmbChar = 0x3099; break;        // KATAKANA LETTER VA
                    364:                case 0x30F8: dstChar = 0x30F0; *cmbChar = 0x3099; break;        // KATAKANA LETTER VI
                    365:                case 0x30F9: dstChar = 0x30F1; *cmbChar = 0x3099; break;        // KATAKANA LETTER VE
                    366:                case 0x30FA: dstChar = 0x30F2; *cmbChar = 0x3099; break;        // KATAKANA LETTER VO
                    367:                
                    368:                default:
                    369:                        /* the rest (41 of them) have a simple conversion */
                    370:                        dstChar = srcChar - 1;
                    371:                        *cmbChar = 0x3099;
                    372:                };
                    373:        } else {
                    374:                dstChar = srcChar;
                    375:        }
                    376:        
                    377:        return dstChar;
                    378: }
                    379:
unix.superglobalmegacorp.com
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.