XNU/bsd/hfs/hfscommon/Unicode/UnicodeWrappers.c - view

File: [Apple XNU] / XNU / bsd / hfs / hfscommon / Unicode / UnicodeWrappers.c
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs
Tue Apr 24 17:44:52 2018 UTC (8 years, 2 months ago) by root
Branches: MAIN, Apple
CVS tags: v68_4-1_1, HEAD

xnu-68.4-1.1

/* * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * * The contents of this file constitute Original Code as defined in and * are subject to the Apple Public Source License Version 1.1 (the * "License"). You may not use this file except in compliance with the * License. Please obtain a copy of the License at * http://www.apple.com/publicsource and read it before using this file. * * This Original Code and all software distributed under the License are * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the * License for the specific language governing rights and limitations * under the License. * * @APPLE_LICENSE_HEADER_END@ */ /* File: UnicodeWrappers.c Contains: Wrapper routines for Unicode conversion and comparison. Version: HFS Plus 1.0 Written by: Mark Day Copyright: � 1996-1999 by Apple Computer, Inc., all rights reserved. File Ownership: DRI: Mark Day Other Contact: Don Brady Technology: xxx put technology here xxx Writers: (DSH) Deric Horn (msd) Mark Day (djb) Don Brady Change History (most recent first): <MOSXS> 6/10/99 djb Add support for Euro Sign (0x20AC) to MacRoman/Unicode conversions. <MOSXS> 2/09/99 djb Fix UnicodeToMacRoman to handle a terminating decomposed char. <MOSXS> 1/22/99 djb Add more TARGET_OS_MAC conditionals to remove orphaned code. <MOSXS> 7/6/98 djb Handle hi-bit Mac Roman characters in basic latin conversions (radar #2247519). <MOSXS> 6/11/98 PPD Added a few special-case ASCII/Unicode mappings to cover installer's needs. <CS41> 1/28/98 msd Bug 2207446: When mangling a name, check to see if the Unicode Converter is installed before we call it. <CS40> 1/21/98 msd Bug 2206836: If a name contains a colon, change it to question mark and mangle the name. <CS39> 12/11/97 msd For Metrowerks and test tools, call the Get_xxx routines to get the Unicode table addresses. <CS38> 12/10/97 djb Radar #2005461, don't use fallback chars when converting to Unicode, instead let the client (Catalog) retry with MacRoman. <CS37> 12/2/97 DSH Conditionalize out some unicode related routines for DFA <CS36> 11/26/97 djb Radar #2005461,2005688 don't swallow kTECPartialCharErr errors! <CS35> 11/17/97 djb Name mangling was broken with decomposed Unicode. <CS34> 11/16/97 djb Radar #2001928 - use kUnicodeCanonicalDecompVariant variant. <CS33> 11/11/97 DSH Use Get_gLowerCaseTable for DiskFirstAid builds to avoid loading in a branch to the table. <CS32> 11/7/97 msd Replace FastSimpleCompareStrings with FastUnicodeCompare (which handles ignorable Unicode characters). Remove the wrapper routine, CompareUnicodeNames, and have its callers call FastUnicodeCompare directly. <CS31> 10/17/97 djb Change kUnicodeUseHFSPlusMapping to kUnicodeUseLatestMapping. <CS30> 10/17/97 msd Fix some type casts for char pointers. <CS29> 10/13/97 djb Add new SPIs for Finder View font (radar #1679073). <CS28> 10/1/97 djb Preserve current heap zone in InitializeEncodingContext routine (radar #1682686). <CS27> 9/17/97 djb Handle kTECPartialCharErr errors in ConvertHFSNameToUnicode. <CS26> 9/16/97 msd In MockConvertFromPStringToUnicode, use pragma unused instead of commenting out unused parameter (so SC will compile it). <CS25> 9/15/97 djb Fix MockConverters to do either 7-bit ascii or else mangle the name (radar #1672388). Use 'p2u#' resource for bootstrapping Unicode. Make sure InitializeEncodingContext uses System heap. <CS24> 9/10/97 msd Make InitializeEncodingContext public. <CS23> 9/7/97 djb Handle '�' char in BasicLatinUnicode converter. <CS22> 9/4/97 djb Add logging to BasicLatinUnicodeToPascal. <CS21> 8/26/97 djb Make FastSimpleCompareStrings faster. Add BasicLatinUnicodeToPascal to make 7-bit ascii conversions faster. <CS20> 8/14/97 djb Add FastRelString here (to be next to the data tables). <CS19> 7/21/97 djb LogEndTime now takes an error code. <CS18> 7/18/97 msd Include LowMemPriv.h, Gestalt.h, TextUtils.h. <CS17> 7/16/97 DSH FilesInternal.i renamed FileMgrInternal.i to avoid name collision <CS16> 7/8/97 DSH Loading PrecompiledHeaders from define passed in on C line <CS15> 7/8/97 DSH InitializeUnicode changed its API <CS14> 7/1/97 DSH SC, DFA complier, requires parameters in functions. #pragma'd them out to eliminate C warnings. <CS13> 6/30/97 msd Remove unused parameter warnings in FallbackProc by commenting out unused parameter names. <CS12> 6/26/97 DSH FallbackProc declare variables before useage for SC, MockConverters no longer static for DFA. <CS11> 6/25/97 msd In function InitStaticUnicodeConverter, the variable fsVars was being used before being initialized. <CS10> 6/24/97 DSH Runtime checks to call through CFM or static linked routines. <CS9> 6/20/97 msd Re-introduce fix from <CS7>. Fix another missing cast. Remove a spurious semicolon. <CS8> 6/18/97 djb Add more ConversionContexts routines. Improved file mangling. <CS7> 6/16/97 msd Add a missing cast in GetFileIDString. <CS6> 6/13/97 djb Added support for long filenames. Switched to ConvertUnicodeToHFSName, ConvertHFSNameToUnicode, and CompareUnicodeNames. <CS5> 6/4/97 djb Use system script instead of macRoman. <CS4> 5/19/97 djb Add call to LockMappingTable so tables won't move! <CS3> 5/9/97 djb Include HFSInstrumentation.h <CS2> 5/7/97 djb Add summary traces. Add FastSimpleCompareStrings routine. <CS1> 4/24/97 djb first checked in <HFS5> 3/27/97 djb Add calls to real Unicode conversion routines. <HFS4> 2/6/97 msd Add conditional code to use real Unicode comparison routines (default to off). <HFS3> 1/6/97 djb Fix HFSUnicodeCompare - the final comparison of length1 and length2 was backwards. <HFS2> 12/12/96 msd Use precompiled headers. <HFS1> 12/12/96 msd first checked in */ #include "../../hfs_macos_defs.h" #include "UCStringCompareData.h" #include "../headers/FileMgrInternal.h" #include "../headers/HFSUnicodeWrappers.h" #include "ConvertUTF.h" enum { kMinFileExtensionChars = 1, // does not include dot kMaxFileExtensionChars = 5 // does not include dot }; #define kASCIIPiSymbol 0xB9 #define kASCIIMicroSign 0xB5 #define kASCIIGreekDelta 0xC6 #define Is7BitASCII(c) ( (c) >= 0x20 && (c) <= 0x7F ) #define IsSpecialASCIIChar(c) ( (c) == (UInt8) kASCIIMicroSign || (c) == (UInt8) kASCIIPiSymbol || (c) == (UInt8) kASCIIGreekDelta ) // Note: '�' has two Unicode representations 0x00B5 (micro sign) and 0x03BC (greek) // '�' has two Unicode representations 0x2206 (increment) and 0x0394 (greek) #define IsSpecialUnicodeChar(c) ( (c) == 0x00B5 || (c) == 0x03BC || (c) == 0x03C0 || (c) == 0x2206 || (c) == 0x0394 ) #define IsHexDigit(c) ( ((c) >= (UInt8) '0' && (c) <= (UInt8) '9') || ((c) >= (UInt8) 'A' && (c) <= (UInt8) 'F') ) static void GetFilenameExtension( ItemCount length, ConstUniCharArrayPtr unicodeStr, Str15 extStr ); static void GetFileIDString( HFSCatalogNodeID fileID, Str15 fileIDStr ); static void AppendPascalString( ConstStr15Param src, Str31 dst ); static UInt32 HexStringToInteger( UInt32 length, const UInt8 *hexStr ); // // Get filename extension (if any) as a pascal string // #if TARGET_API_MAC_OS8 static void GetFilenameExtension( ItemCount length, ConstUniCharArrayPtr unicodeStr, Str15 extStr ) { UInt32 i; UniChar c; UInt16 extChars; // number of extension characters (excluding the dot) UInt16 maxExtChars; Boolean foundExtension; extStr[0] = (UInt8) 0; // assume there's no extension if ( length < 3 ) return; // sorry, "x.y" is smallest possible extension if ( length < (kMaxFileExtensionChars + 2) ) maxExtChars = length - 2; // we need at least one prefix character and dot else maxExtChars = kMaxFileExtensionChars; i = length; extChars = 0; foundExtension = false; while ( extChars <= maxExtChars ) { c = unicodeStr[--i]; if ( c == (UniChar) '.' ) // look for leading dot { if ( extChars > 0 ) // cannot end with a dot foundExtension = true; break; } if ( Is7BitASCII(c) || IsSpecialUnicodeChar(c) ) ++extChars; else break; } // if we found one then copy it if ( foundExtension ) { UInt8 *extStrPtr = extStr; const UniChar *unicodeStrPtr = &unicodeStr[i]; // point to dot char *(extStrPtr++) = extChars + 1; // set length to extension chars plus dot for ( i = 0; i <= extChars; ++i ) { c = *(unicodeStrPtr++); // map any special characters switch (c) { case 0x00B5: // micro sign case 0x03BC: // greek mu c = (UniChar) '�'; break; case 0x03C0: // greek pi c = (UniChar) '�'; break; case 0x2206: // increment sign case 0x0394: // greek capital delta c = (UniChar) '�'; break; } *(extStrPtr++) = (UInt8) c; // copy/convert to ascii } } } // end GetFilenameExtension #endif /* TARGET_API_MAC_OS8 */ // // Count filename extension characters (if any) // static UInt32 CountFilenameExtensionChars( const unsigned char * filename, UInt32 length ) { UInt32 i; UniChar c; UInt32 extChars; // number of extension characters (excluding the dot) UInt16 maxExtChars; Boolean foundExtension; if (length == kUndefinedStrLen) length = strlen(filename); if ( length < 3 ) return 0; // sorry, "x.y" is smallest possible extension if ( length < (kMaxFileExtensionChars + 2) ) maxExtChars = length - 2; // we need at least on prefix character and dot else maxExtChars = kMaxFileExtensionChars; extChars = 0; // assume there's no extension i = length - 1; // index to last ascii character foundExtension = false; while ( extChars <= maxExtChars ) { c = filename[i--]; if ( c == (UInt8) '.' ) // look for leading dot { if ( extChars > 0 ) // cannot end with a dot return (extChars); break; } if ( Is7BitASCII(c) || IsSpecialASCIIChar(c) ) ++extChars; else break; } return 0; } // end CountFilenameExtensionChars // // Convert file ID into a hexidecimal string with no leading zeros // #if TARGET_API_MAC_OS8 static void GetFileIDString( HFSCatalogNodeID fileID, Str15 fileIDStr ) { SInt32 i, b; static UInt8 *translate = (UInt8 *) "0123456789ABCDEF"; UInt8 c; fileIDStr[1] = '#'; for ( i = 1, b = 28; b >= 0; b -= 4 ) { c = *(translate + ((fileID >> b) & 0x0000000F)); // if its not a leading zero add it to our string if ( (c != (UInt8) '0') || (i > 1) || (b == 0) ) fileIDStr[++i] = c; } fileIDStr[0] = (UInt8) i; } // end GetFileIDString #endif /* TARGET_API_MAC_OS8 */ // // Append a suffix to a pascal string // #if TARGET_API_MAC_OS8 static void AppendPascalString( ConstStr15Param src, Str31 dst ) { UInt32 i, j; UInt32 srcLen; srcLen = StrLength(src); if ( (srcLen + StrLength(dst)) > 31 ) // safety net return; i = dst[0] + 1; // get end of dst for (j = 1; j <= srcLen; ++j) dst[i++] = src[j]; dst[0] += srcLen; } // end AppendPascalString #endif /* TARGET_API_MAC_OS8 */ HFSCatalogNodeID GetEmbeddedFileID(const unsigned char * filename, UInt32 length, UInt32 *prefixLength) { short extChars; short i; UInt8 c; // current character in filename *prefixLength = 0; if ( filename == NULL ) return 0; if (length == kUndefinedStrLen) length = strlen(filename); if ( length < 4 ) return 0; // too small to have a file ID if ( length >= 6 ) // big enough for a file ID (#10) and an extension (.x) ? extChars = CountFilenameExtensionChars(filename, length); else extChars = 0; if ( extChars > 0 ) length -= (extChars + 1); // skip dot plus extension characters // scan for file id digits... for ( i = length - 1; i >= 0; --i) { c = filename[i]; if ( c == '#' ) // look for file ID marker { if ( (length - i) < 3 ) break; // too small to be a file ID *prefixLength = i; return HexStringToInteger(length - i - 1, &filename[i+1]); } if ( !IsHexDigit(c) ) break; // file ID string must have hex digits } return 0; } // end GetEmbeddedFileID //_______________________________________________________________________ static UInt32 HexStringToInteger (UInt32 length, const UInt8 *hexStr) { UInt32 value; // decimal value represented by the string short i; UInt8 c; // next character in buffer const UInt8 *p; // pointer to character string value = 0; p = hexStr; for ( i = 0; i < length; ++i ) { c = *p++; if (c >= '0' && c <= '9') { value = value << 4; value += (UInt32) c - (UInt32) '0'; } else if (c >= 'A' && c <= 'F') { value = value << 4; value += 10 + ((unsigned int) c - (unsigned int) 'A'); } else { return 0; // oops, how did this character get in here? } } return value; } // end HexStringToInteger //_______________________________________________________________________ // // Routine: FastRelString // // Output: returns -1 if str1 < str2 // returns 1 if str1 > str2 // return 0 if equal // //_______________________________________________________________________ extern unsigned short gCompareTable[]; SInt32 FastRelString( ConstStr255Param str1, ConstStr255Param str2 ) { UInt16* compareTable; SInt32 bestGuess; UInt8 length, length2; UInt8 delta; delta = 0; length = *(str1++); length2 = *(str2++); if (length == length2) bestGuess = 0; else if (length < length2) { bestGuess = -1; delta = length2 - length; } else { bestGuess = 1; length = length2; } compareTable = (UInt16*) gCompareTable; while (length--) { UInt8 aChar, bChar; aChar = *(str1++); bChar = *(str2++); if (aChar != bChar) // If they don't match exacly, do case conversion { UInt16 aSortWord, bSortWord; aSortWord = compareTable[aChar]; if (bChar == 0 && delta == 1) { bChar = *(str2++); /* skip over embedded null */ bestGuess = 0; } bSortWord = compareTable[bChar]; if (aSortWord > bSortWord) return 1; if (aSortWord < bSortWord) return -1; } // If characters match exactly, then go on to next character immediately without // doing any extra work. } // if you got to here, then return bestGuess return bestGuess; } // // FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering // // IF RESULT // -------------------------- // str1 < str2 => -1 // str1 = str2 => 0 // str1 > str2 => +1 // // The lower case table starts with 256 entries (one for each of the upper bytes // of the original Unicode char). If that entry is zero, then all characters with // that upper byte are already case folded. If the entry is non-zero, then it is // the _index_ (not byte offset) of the start of the sub-table for the characters // with that upper byte. All ignorable characters are folded to the value zero. // // In pseudocode: // // Let c = source Unicode character // Let table[] = lower case table // // lower = table[highbyte(c)] // if (lower == 0) // lower = c // else // lower = table[lower+lowbyte(c)] // // if (lower == 0) // ignore this character // // To handle ignorable characters, we now need a loop to find the next valid character. // Also, we can't pre-compute the number of characters to compare; the string length might // be larger than the number of non-ignorable characters. Further, we must be able to handle // ignorable characters at any point in the string, including as the first or last characters. // We use a zero value as a sentinel to detect both end-of-string and ignorable characters. // Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename, // the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is // an invalid Unicode character). // // Pseudocode: // // while (1) { // c1 = GetNextValidChar(str1) // returns zero if at end of string // c2 = GetNextValidChar(str2) // // if (c1 != c2) break // found a difference // // if (c1 == 0) // reached end of string on both strings at once? // return 0; // yes, so strings are equal // } // // // When we get here, c1 != c2. So, we just need to determine which one is less. // if (c1 < c2) // return -1; // else // return 1; // extern UInt16 gLowerCaseTable[]; extern UInt16 gLatinCaseFold[]; SInt32 FastUnicodeCompare ( register ConstUniCharArrayPtr str1, register ItemCount length1, register ConstUniCharArrayPtr str2, register ItemCount length2) { register UInt16 c1,c2; register UInt16 temp; register UInt16* lowerCaseTable; lowerCaseTable = (UInt16*) gLowerCaseTable; while (1) { /* Set default values for c1, c2 in case there are no more valid chars */ c1 = 0; c2 = 0; /* Find next non-ignorable char from str1, or zero if no more */ while (length1 && c1 == 0) { c1 = *(str1++); --length1; /* check for basic latin first */ if (c1 < 0x0100) { c1 = gLatinCaseFold[c1]; break; } /* case fold if neccessary */ if ((temp = lowerCaseTable[c1>>8]) != 0) c1 = lowerCaseTable[temp + (c1 & 0x00FF)]; } /* Find next non-ignorable char from str2, or zero if no more */ while (length2 && c2 == 0) { c2 = *(str2++); --length2; /* check for basic latin first */ if (c2 < 0x0100) { if ((c2 = gLatinCaseFold[c2]) != 0) break; else continue; /* ignore this character */ } /* case fold if neccessary */ if ((temp = lowerCaseTable[c2>>8]) != 0) c2 = lowerCaseTable[temp + (c2 & 0x00FF)]; } if (c1 != c2) // found a difference, so stop looping break; if (c1 == 0) // did we reach the end of both strings at the same time? return 0; // yes, so strings are equal } if (c1 < c2) return -1; else return 1; } OSErr ConvertUTF8ToUnicode(ByteCount srcLen, const unsigned char* srcStr, ByteCount maxDstLen, ByteCount *actualDstLen, UniCharArrayPtr dstStr) { ConversionResult result; UTF8* sourceStart; UTF8* sourceEnd; UTF16* targetStart; UTF16* targetEnd; sourceStart = (UTF8*) srcStr; sourceEnd = sourceStart + srcLen; targetStart = (UTF16*) dstStr; targetEnd = targetStart + maxDstLen/2; result = ConvertUTF8toUTF16 (&sourceStart, sourceEnd, &targetStart, targetEnd); *actualDstLen = (targetStart - dstStr) * sizeof(UniChar); if (result == targetExhausted) return kTECOutputBufferFullStatus; else if (result == sourceExhausted) return kTextMalformedInputErr; return noErr; } OSErr ConvertUnicodeToUTF8(ByteCount srcLen, ConstUniCharArrayPtr srcStr, ByteCount maxDstLen, ByteCount *actualDstLen, unsigned char* dstStr) { ConversionResult result; UTF16* sourceStart; UTF16* sourceEnd; UTF8* targetStart; UTF8* targetEnd; ByteCount outputLength; sourceStart = (UTF16*) srcStr; sourceEnd = (UTF16*) ((char*) srcStr + srcLen); targetStart = (UTF8*) dstStr; targetEnd = targetStart + maxDstLen; result = ConvertUTF16toUTF8 (&sourceStart, sourceEnd, &targetStart, targetEnd); *actualDstLen = outputLength = targetStart - dstStr; if (result == targetExhausted) return kTECOutputBufferFullStatus; else if (result == sourceExhausted) return kTECPartialCharErr; if (outputLength >= maxDstLen) return kTECOutputBufferFullStatus; dstStr[outputLength] = 0; /* also add null termination */ return noErr; }

This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.