Annotation of XNU/bsd/hfs/hfscommon/Unicode/ConvertUTF.c, revision 1.1

1.1     ! root        1: /*
        !             2:  * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
        !             3:  *
        !             4:  * @APPLE_LICENSE_HEADER_START@
        !             5:  * 
        !             6:  * The contents of this file constitute Original Code as defined in and
        !             7:  * are subject to the Apple Public Source License Version 1.1 (the
        !             8:  * "License").  You may not use this file except in compliance with the
        !             9:  * License.  Please obtain a copy of the License at
        !            10:  * http://www.apple.com/publicsource and read it before using this file.
        !            11:  * 
        !            12:  * This Original Code and all software distributed under the License are
        !            13:  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
        !            14:  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
        !            15:  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
        !            16:  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
        !            17:  * License for the specific language governing rights and limitations
        !            18:  * under the License.
        !            19:  * 
        !            20:  * @APPLE_LICENSE_HEADER_END@
        !            21:  */
        !            22: /* ================================================================ */
        !            23: /*
        !            24: File:  ConvertUTF.c
        !            25: Author: Mark E. Davis
        !            26: Copyright (C) 1994 Taligent, Inc. All rights reserved.
        !            27: 
        !            28: This code is copyrighted. Under the copyright laws, this code may not
        !            29: be copied, in whole or part, without prior written consent of Taligent. 
        !            30: 
        !            31: Taligent grants the right to use or reprint this code as long as this
        !            32: ENTIRE copyright notice is reproduced in the code or reproduction.
        !            33: The code is provided AS-IS, AND TALIGENT DISCLAIMS ALL WARRANTIES,
        !            34: EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO IMPLIED
        !            35: WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  IN
        !            36: NO EVENT WILL TALIGENT BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING,
        !            37: WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS
        !            38: INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY
        !            39: LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THIS CODE, EVEN
        !            40: IF TALIGENT HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
        !            41: BECAUSE SOME STATES DO NOT ALLOW THE EXCLUSION OR LIMITATION OF
        !            42: LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE
        !            43: LIMITATION MAY NOT APPLY TO YOU.
        !            44: 
        !            45: RESTRICTED RIGHTS LEGEND: Use, duplication, or disclosure by the
        !            46: government is subject to restrictions as set forth in subparagraph
        !            47: (c)(l)(ii) of the Rights in Technical Data and Computer Software
        !            48: clause at DFARS 252.227-7013 and FAR 52.227-19.
        !            49: 
        !            50: This code may be protected by one or more U.S. and International
        !            51: Patents.
        !            52: 
        !            53: TRADEMARKS: Taligent and the Taligent Design Mark are registered
        !            54: trademarks of Taligent, Inc.
        !            55: 
        !            56: HISTORY:
        !            57: 
        !            58:        22-Jan-1999     Don Brady               Add decomposition to ConvertUTF8toUTF16.
        !            59:        17-Nov-1998     Don Brady               Add ":" to "/" conversions.
        !            60: */
        !            61: /* ================================================================ */
        !            62: 
        !            63: #include "ConvertUTF.h"
        !            64: 
        !            65: /* ================================================================ */
        !            66: 
        !            67: const int halfShift                            = 10;   /* shift applied to the high-surrogate part when joining or splitting a surrogate pair */
        !            68: const UCS4 halfBase                            = 0x0010000UL;  /* added after joining a surrogate pair, subtracted before splitting one */
        !            69: const UCS4 halfMask                            = 0x3FFUL;      /* selects the bits that go into the low surrogate */
        !            70: const UCS4 kSurrogateHighStart = 0xD800UL;     /* first value treated as a high (leading) surrogate */
        !            71: const UCS4 kSurrogateHighEnd   = 0xDBFFUL;     /* last value treated as a high (leading) surrogate */
        !            72: const UCS4 kSurrogateLowStart  = 0xDC00UL;     /* first value treated as a low (trailing) surrogate */
        !            73: const UCS4 kSurrogateLowEnd            = 0xDFFFUL;     /* last value treated as a low (trailing) surrogate */
        !            74: 
        !            75: const UCS4 kReplacementCharacter =     0x0000FFFDUL;   /* substituted for values the converters cannot encode */
        !            76: const UCS4 kMaximumUCS2 =                      0x0000FFFFUL;   /* largest value ConvertUTF8toUTF16 stores as a single UTF-16 unit */
        !            77: const UCS4 kMaximumUTF16 =                     0x0010FFFFUL;   /* largest value ConvertUTF8toUTF16 stores as a surrogate pair */
        !            78: const UCS4 kMaximumUCS4 =                      0x7FFFFFFFUL;   /* largest value ConvertUTF16toUTF8 accepts for a 6-byte sequence */
        !            79: 
        !            80: /* ================================================================ */
        !            81: 
        !            82: UCS4 offsetsFromUTF8[6] =      {0x00000000UL, 0x00003080UL, 0x000E2080UL, 
        !            83:                                                         0x03C82080UL, 0xFA082080UL, 0x82082080UL};     /* indexed by trailing-byte count; ConvertUTF8toUTF16 subtracts the entry from the summed, shifted bytes */
        !            84: char bytesFromUTF8[256] = {    /* indexed by the first byte of a sequence: how many more bytes follow it */
        !            85:        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        !            86:        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        !            87:        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        !            88:        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        !            89:        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        !            90:        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        !            91:        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        !            92:        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5};
        !            93: 
        !            94: UTF8 firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};    /* indexed by sequence length; ConvertUTF16toUTF8 ORs the entry into the first byte it writes */
        !            95: 
        !            96: static UTF16 Decomposer(register UTF16 srcChar, UTF16 *cmbChar);       /* defined after ConvertUTF8toUTF16, which calls it */
        !            97: 
        !            98: /* ================================================================ */
        !            99: /*     This code is similar in effect to making successive calls on the
        !           100: * mbtowc and wctomb routines in FSS-UTF. However, it is considerably
        !           101: * different in code:
        !           102: * - it is adapted to be consistent with UTF16;
        !           103: * - the interface converts a whole buffer to avoid function-call overhead;
        !           104: * - constants have been gathered;
        !           105: * - loops & conditionals have been removed as much as possible for
        !           106: *   efficiency, in favor of drop-through switch statements.
        !           107: */
        !           108: 
        !           109: /*
        !           110:  * Colons vs. Slash
        !           111:  *
        !           112:  * The VFS layer uses a "/" as a pathname separator but HFS disks
        !           113:  * use a ":".  So when converting from UTF-8, ":" characters need
        !           114:  * to be changed to "/" so that colons don't end up on HFS disks.
        !           115:  * Likewise when converting into UTF-8, "/" characters need to be
        !           116:  * changed to ":" so that a "/" in a filename is not returned 
        !           117:  * through the VFS layer.
        !           118:  *
        !           119:  * We do not need to worry about full-width slash or colons since
        !           120:  * their respective representations outside of Unicode are never
        !           121:  * the 7-bit versions (0x2f or 0x3a).
        !           122:  */
        !           123: /* ================================================================ */
        !           124: /* ConvertUTF16toUTF8: convert the UTF-16 units in [*sourceStart, sourceEnd) to UTF-8 bytes in [*targetStart, targetEnd), writing '/' as ':' and dropping NUL units.
        !           125:  * Returns ok or targetExhausted.  *sourceStart and *targetStart are advanced past what was converted; on targetExhausted *sourceStart points at the first unconverted character. */
        !           126: ConversionResult       ConvertUTF16toUTF8 (
        !           127:                UTF16** sourceStart, const UTF16* sourceEnd, 
        !           128:                UTF8** targetStart, const UTF8* targetEnd)
        !           129: {
        !           130:        ConversionResult result = ok;
        !           131:        register UTF16* source = *sourceStart;
        !           132:        register UTF8* target = *targetStart;
        !           133:        while (source < sourceEnd) {
        !           134:                register UCS4 ch;
        !           135:                register unsigned short bytesToWrite;
        !           136:                register const UCS4 byteMask = 0xBF;
        !           137:                register const UCS4 byteMark = 0x80; 
        !           138:                register const UCS4 slash = '/'; 
        !           139: 
        !           140:                ch = *source++;
        !           141: 
        !           142:                /* optimize for ASCII case... */
        !           143:                if (ch < 0x80) {
        !           144:                        if (ch == 0) {
        !           145:                                continue;       /* skip over embedded NULs; they write nothing, so no room is needed */
        !           146:                        }
        !           147:                        if (target >= targetEnd) {
        !           148:                                --source;       /* un-read the unit so the caller can resume at it */
        !           149:                                result = targetExhausted; break;
        !           150:                        }
        !           151:                        if (ch == slash) {
        !           152:                                ch = ':';       /* VFS doesn't like slash */
        !           153:                        }
        !           154: 
        !           155:                        *target++ = ch;
        !           156:                        continue;
        !           157:                } else if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd
        !           158:                                && source < sourceEnd) {
        !           159:                        register UCS4 ch2 = *source;
        !           160:                        if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) {
        !           161:                                ch = ((ch - kSurrogateHighStart) << halfShift)
        !           162:                                        + (ch2 - kSurrogateLowStart) + halfBase;
        !           163:                                ++source;       /* the pair is consumed as one character */
        !           164:                        };
        !           165:                };
        !           166: 
        !           167:                if (ch < 0x80) {                                bytesToWrite = 1;
        !           168:                } else if (ch < 0x800) {                bytesToWrite = 2;
        !           169:                } else if (ch < 0x10000) {              bytesToWrite = 3;
        !           170:                } else if (ch < 0x200000) {             bytesToWrite = 4;
        !           171:                } else if (ch < 0x4000000) {    bytesToWrite = 5;
        !           172:                } else if (ch <= kMaximumUCS4){ bytesToWrite = 6;
        !           173:                } else {                                                bytesToWrite = 3;       /* 0xFFFD falls in the 3-byte range above */
        !           174:                                                                                ch = kReplacementCharacter;
        !           175:                }; /* I wish there were a smart way to avoid this conditional */
        !           176:                if (targetEnd - target < bytesToWrite) {        /* compare counts; never form a pointer past targetEnd */
        !           177:                        source -= (ch > kMaximumUCS2) ? 2 : 1;  /* un-read the unit(s) of this character (two if a pair was joined) */
        !           178:                        result = targetExhausted; break;
        !           179:                };
        !           180:                target += bytesToWrite;         /* bytes are stored back to front from here */
        !           181:                switch (bytesToWrite) { /* note: code falls through cases! */
        !           182:                        case 6: *--target = (ch | byteMark) & byteMask; ch >>= 6;
        !           183:                        case 5: *--target = (ch | byteMark) & byteMask; ch >>= 6;
        !           184:                        case 4: *--target = (ch | byteMark) & byteMask; ch >>= 6;
        !           185:                        case 3: *--target = (ch | byteMark) & byteMask; ch >>= 6;
        !           186:                        case 2: *--target = (ch | byteMark) & byteMask; ch >>= 6;
        !           187:                        case 1: *--target =  ch | firstByteMark[bytesToWrite];
        !           188:                };
        !           189:                target += bytesToWrite;
        !           190:        };
        !           191:        *sourceStart = source;
        !           192:        *targetStart = target;
        !           193:        return result;
        !           194: };
        !           195: /* ================================================================ */
        !           196: /* ConvertUTF8toUTF16: convert the UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-16 units in [*targetStart, targetEnd), writing ':' as '/' and passing every value up to kMaximumUCS2 through Decomposer.
        !           197:  * Returns ok, sourceExhausted (input ends inside a sequence) or targetExhausted.  *sourceStart and *targetStart are advanced past whole characters only, so a caller can resume from them. */
        !           198: ConversionResult       ConvertUTF8toUTF16 (
        !           199:                UTF8** sourceStart, UTF8* sourceEnd, 
        !           200:                UTF16** targetStart, const UTF16* targetEnd)
        !           201: {
        !           202:        ConversionResult result = ok;
        !           203:        register UTF8* source = *sourceStart;
        !           204:        register UTF16* target = *targetStart;
        !           205: 
        !           206:        while (source < sourceEnd) {
        !           207:                UTF8 byte;
        !           208:                register UCS4 ch;
        !           209:                register unsigned short extraBytesToWrite;
        !           210: 
        !           211:                /* optimize for ASCII case...*/
        !           212:                byte = *source;
        !           213:                if (byte < 128) {
        !           214:                        if (target >= targetEnd) {      /* the fast path must not write past the buffer either */
        !           215:                                result = targetExhausted; break;
        !           216:                        };
        !           217:                        *target++ = (byte == ':') ? '/' : byte; /* HFS doesn't like colons */
        !           218:                        source++;
        !           219:                        continue;
        !           220:                }
        !           221:                extraBytesToWrite = bytesFromUTF8[byte];
        !           222:                if (extraBytesToWrite >= sourceEnd - source) {  /* the switch below reads extraBytesToWrite + 1 bytes */
        !           223:                        result = sourceExhausted; break;
        !           224:                };
        !           225:                if (target >= targetEnd) {      /* every branch below stores at least one unit; check before consuming input */
        !           226:                        result = targetExhausted; break;
        !           227:                };
        !           228:                ch = 0;
        !           229:                switch(extraBytesToWrite) {     /* note: code falls through cases! */
        !           230:                        case 5: ch += *source++; ch <<= 6;
        !           231:                        case 4: ch += *source++; ch <<= 6;
        !           232:                        case 3: ch += *source++; ch <<= 6;
        !           233:                        case 2: ch += *source++; ch <<= 6;
        !           234:                        case 1: ch += *source++; ch <<= 6;
        !           235:                        case 0: ch += *source++;
        !           236:                };
        !           237:                ch -= offsetsFromUTF8[extraBytesToWrite];
        !           238: 
        !           239:                if (ch <= kMaximumUCS2) {
        !           240:                        UTF16 combine;
        !           241:                        UTF16 base = Decomposer(ch, &combine);
        !           242: 
        !           243:                        if (combine && targetEnd - target < 2) {        /* never emit a base character without its combining mark */
        !           244:                                source -= extraBytesToWrite + 1;        /* un-read this sequence so the caller can resume at it */
        !           245:                                result = targetExhausted; break;
        !           246:                        };
        !           247:                        *target++ = base;
        !           248:                        if (combine)
        !           249:                                *target++ = combine;
        !           250:                } else if (ch > kMaximumUTF16) {
        !           251:                        *target++ = kReplacementCharacter;
        !           252:                } else {
        !           253:                        if (targetEnd - target < 2) {   /* a surrogate pair needs two units */
        !           254:                                source -= extraBytesToWrite + 1; result = targetExhausted; break;
        !           255:                        };
        !           256:                        ch -= halfBase;
        !           257:                        *target++ = (ch >> halfShift) + kSurrogateHighStart;
        !           258:                        *target++ = (ch & halfMask) + kSurrogateLowStart;
        !           259:                };
        !           260:        };
        !           261:        *sourceStart = source;
        !           262:        *targetStart = target;
        !           263:        return result;
        !           264: };
        !           265: 
        !           266: /*
        !           267:  * Lookup tables for Unicode chars 0x00C0 thru 0x00FF
        !           268:  * primary_char yields first decomposed char. If this
        !           269:  * char is an alpha char then get the combining char
        !           270:  * from the combining_char table and add 0x0300 to it.
        !           271:  */
        !           272: 
        !           273: static unsigned char primary_char[64] = {      /* indexed by (character - 0x00C0) in Decomposer */
        !           274:        0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0xC6, 0x43,        /* 0xC0-0xC7 */
        !           275: 
        !           276:        0x45, 0x45, 0x45, 0x45, 0x49, 0x49, 0x49, 0x49,        /* 0xC8-0xCF */
        !           277: 
        !           278:        0xD0, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x4F, 0xD7,        /* 0xD0-0xD7 */
        !           279: 
        !           280:        0xD8, 0x55, 0x55, 0x55, 0x55, 0x59, 0xDE, 0xDF,        /* 0xD8-0xDF */
        !           281: 
        !           282:        0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0xE6, 0x63,        /* 0xE0-0xE7 */
        !           283: 
        !           284:        0x65, 0x65, 0x65, 0x65, 0x69, 0x69, 0x69, 0x69,        /* 0xE8-0xEF */
        !           285: 
        !           286:        0xF0, 0x6E, 0x6F, 0x6F, 0x6F, 0x6F, 0x6F, 0xF7,        /* 0xF0-0xF7 */
        !           287: 
        !           288:        0xF8, 0x75, 0x75, 0x75, 0x75, 0x79, 0xFE, 0x79,        /* 0xF8-0xFF */
        !           289: };
        !           290: /* Parallel to primary_char.  0xFF fills the slots whose primary_char entry is above 'z'; Decomposer never reads those. */
        !           291: static unsigned char combining_char[64] = {
        !           292:        0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,        /* 0xC0-0xC7 */
        !           293: 
        !           294:        0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08,        /* 0xC8-0xCF */
        !           295: 
        !           296:        0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,        /* 0xD0-0xD7 */
        !           297: 
        !           298:        0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0xFF,        /* 0xD8-0xDF */
        !           299: 
        !           300:        0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,        /* 0xE0-0xE7 */
        !           301: 
        !           302:        0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08,        /* 0xE8-0xEF */
        !           303: 
        !           304:        0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,        /* 0xF0-0xF7 */
        !           305: 
        !           306:        0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0x08 /* 0xF8-0xFF */
        !           307: };
        !           308: 
        !           309: /* One bit per character, 32 characters per entry; Decomposer only splits a character in this range when its bit is set. */
        !           310: static const unsigned long __CJKDecompBitmap[] = {     // 0x3000 ~ 0x30FF
        !           311:     0x00000000, 0x00000000, 0x000AAAAA, 0xA540DB6C,    // 0x3000
        !           312:     0x00000802, 0x000AAAAA, 0xA540DB6C, 0x000009E2,    // 0x3080
        !           313: };
        !           314: /* Nonzero when unicodeVal's bit is set in table; within each entry, bit 31 stands for the lowest value.  NOTE(review): for unicodeVal % 32 == 0 this evaluates 1 << 31, which shifts into the sign bit of int -- consider 1UL. */
        !           315: #define IS_DECOMPOSABLE(table,unicodeVal) (table[(unicodeVal) / 32] & (1 << (31 - ((unicodeVal) % 32))))
        !           316: 
        !           317: 
        !           318: /*
        !           319:  * Decomposer
        !           320:  *
        !           321:  * Composed Unicode characters are forbidden on
        !           322:  * HFS Plus volumes. Decomposer will convert a
        !           323:  * composed character into its correct decomposed
        !           324:  * sequence.
        !           325:  *
        !           326:  * Currently only MacRoman and MacJapanese chars
        !           327:  * are handled.  Other composed characters are
        !           328:  * passed unchanged.
        !           329:  */
        !           330: static UTF16
        !           331: Decomposer(register UTF16 srcChar, UTF16 *cmbChar)
        !           332: {
        !           333:        UTF16 base = srcChar;   /* default: the character passes through unchanged */
        !           334:        
        !           335:        *cmbChar = 0;           /* stays 0 when there is no combining character to emit */
        !           336: 
        !           337:        /* 0x00C0-0x00FF: both halves come from the lookup tables. */
        !           338:        if ((srcChar >= 0x00C0) && (srcChar <= 0x00FF)) {
        !           339:                const unsigned int slot = srcChar - 0x00C0;
        !           340: 
        !           341:                base = (UTF16) primary_char[slot];
        !           342:                if (base <= 'z')
        !           343:                        *cmbChar = (UTF16) 0x0300 + (UTF16) combining_char[slot];
        !           344:                return base;
        !           345:        }
        !           346: 
        !           347:        /* Anything outside 0x3001-0x30FF, or not flagged in the bitmap, is returned as is. */
        !           348:        if ((srcChar <= 0x3000) || (srcChar >= 0x3100) ||
        !           349:                        !IS_DECOMPOSABLE(__CJKDecompBitmap, srcChar - 0x3000))
        !           350:                return base;
        !           351: 
        !           352:        switch (srcChar) {
        !           353:        /* HIRAGANA and KATAKANA LETTER PA, PI, PU, PE, PO: base is two below, combining 0x309A */
        !           354:        case 0x3071: case 0x3074: case 0x3077: case 0x307A: case 0x307D:
        !           355:        case 0x30D1: case 0x30D4: case 0x30D7: case 0x30DA: case 0x30DD:
        !           356:                base = srcChar - 2;
        !           357:                *cmbChar = 0x309A;
        !           358:                break;
        !           359: 
        !           360:        /* HIRAGANA and KATAKANA LETTER VU: base is 0x4E below (0x3046 / 0x30A6), combining 0x3099 */
        !           361:        case 0x3094: case 0x30F4:
        !           362:                base = srcChar - 0x4E;
        !           363:                *cmbChar = 0x3099;
        !           364:                break;
        !           365: 
        !           366:        /* KATAKANA LETTER VA, VI, VE, VO: base is eight below (0x30EF-0x30F2), combining 0x3099 */
        !           367:        case 0x30F7: case 0x30F8: case 0x30F9: case 0x30FA:
        !           368:                base = srcChar - 8;
        !           369:                *cmbChar = 0x3099;
        !           370:                break;
        !           371: 
        !           372:        default:
        !           373:                /* the rest (41 of them) have a simple conversion */
        !           374:                base = srcChar - 1;
        !           375:                *cmbChar = 0x3099;
        !           376:        }
        !           377:        return base;
        !           378: }
        !           379: 

unix.superglobalmegacorp.com

This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.