/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "nsUTF16ToUnicode.h"
#include "nsCharTraits.h"
#include "mozilla/CheckedInt.h"
#include "mozilla/EndianUtils.h"

#include <string.h> // memcpy

// Decoder states stored in |mState| between calls.
enum {
  // No partial data is pending.
  STATE_NORMAL = 0,
  // The first byte of a 16-bit code unit is held in |mOddByte|.
  STATE_HALF_CODE_POINT = 1,
  // UTF16ConvertToUnicode skips the first two input bytes in this state.
  STATE_FIRST_CALL = 2,
  // UTF16ConvertToUnicode skips the first input byte in this state.
  STATE_SECOND_BYTE = STATE_FIRST_CALL | STATE_HALF_CODE_POINT,
  // A complete surrogate pair is held in |mOddHighSurrogate| and
  // |mOddLowSurrogate| because the output buffer had no room for it.
  STATE_ODD_SURROGATE_PAIR = 4
};

/**
 * Core UTF-16 byte stream to char16_t conversion shared by all decoders in
 * this file.
 *
 * @param aSrc        input bytes.
 * @param aSrcLength  in: number of bytes available at |aSrc|.
 *                    out: number of bytes consumed. It is left untouched on
 *                    the early return taken when no input remains after the
 *                    state handling, where everything has been consumed.
 * @param aDest       output buffer.
 * @param aDestLength in: capacity of |aDest| in char16_t units.
 *                    out: number of units written.
 * @param aSwapBytes  when true, the two bytes of every code unit are swapped
 *                    after being read in host byte order.
 *
 * @return NS_OK when all input was converted and nothing is pending;
 *         NS_OK_UDEC_MOREINPUT when a partial code unit or an unpaired high
 *         surrogate is being held for the next call;
 *         NS_OK_UDEC_MOREOUTPUT when |aDest| ran out of room;
 *         NS_ERROR_ILLEGAL_INPUT when |mErrBehavior| is kOnError_Signal and
 *         an unpaired surrogate is encountered.
 */
nsresult
nsUTF16ToUnicodeBase::UTF16ConvertToUnicode(const char * aSrc,
                                            int32_t * aSrcLength,
                                            char16_t * aDest,
                                            int32_t * aDestLength,
                                            bool aSwapBytes)
{
  const char* src = aSrc;
  const char* srcEnd = aSrc + *aSrcLength;
  char16_t* dest = aDest;
  char16_t* destEnd = aDest + *aDestLength;
  // Deliberately left uninitialized here: |goto error| in the switch below
  // must not jump over an initialization.
  char16_t oddHighSurrogate;

  switch(mState) {
    case STATE_FIRST_CALL:
      NS_ASSERTION(*aSrcLength > 1, "buffer too short");
      src+=2;
      mState = STATE_NORMAL;
      break;

    case STATE_SECOND_BYTE:
      NS_ASSERTION(*aSrcLength > 0, "buffer too short");
      src++;
      mState = STATE_NORMAL;
      break;

    case STATE_ODD_SURROGATE_PAIR:
      // Flush the pair that did not fit into the previous output buffer.
      if (*aDestLength < 2)
        goto error;
      else {
        *dest++ = mOddHighSurrogate;
        *dest++ = mOddLowSurrogate;
        mOddHighSurrogate = mOddLowSurrogate = 0;
        mState = STATE_NORMAL;
      }
      break;

    case STATE_NORMAL:
    case STATE_HALF_CODE_POINT:
    default:
      break;
  }

  oddHighSurrogate = mOddHighSurrogate;

  if (src == srcEnd) {
    *aDestLength = dest - aDest;
    return (mState != STATE_NORMAL || oddHighSurrogate) ?
           NS_OK_UDEC_MOREINPUT : NS_OK;
  }

  // No initializers on these two either: |goto have_codepoint| jumps into
  // the loop body below.
  const char* srcEvenEnd;
  char16_t u;

  if (mState == STATE_HALF_CODE_POINT) {
    if (dest == destEnd)
      goto error;

    // the 1st byte of a 16-bit code unit was stored in |mOddByte| in the
    // previous run while the 2nd byte has to come from |*src|.
    mState = STATE_NORMAL;
    // Assemble the unit exactly as a host-order 16-bit load would have seen
    // it. The new byte is widened through uint8_t in both branches so that a
    // negative |char| is never shifted or sign-extended.
#if MOZ_BIG_ENDIAN
    u = (mOddByte << 8) | uint8_t(*src++); // safe, we know we have at least one byte.
#else
    u = (uint8_t(*src++) << 8) | mOddByte; // safe, we know we have at least one byte.
#endif
    srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
    goto have_codepoint;
  } else {
    srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
  }

  while (src != srcEvenEnd) {
    if (dest == destEnd)
      goto error;

    // |src| carries no alignment guarantee for char16_t, so copy the two
    // bytes instead of dereferencing a casted pointer.
    memcpy(&u, src, 2);
    src += 2;

have_codepoint:
    if (aSwapBytes)
      u = u << 8 | u >> 8;

    if (!IS_SURROGATE(u)) {
      if (oddHighSurrogate) {
        // A high surrogate was not followed by a low surrogate.
        if (mErrBehavior == kOnError_Signal) {
          goto error2;
        }
        *dest++ = UCS2_REPLACEMENT_CHAR;
        // NOTE(review): when this exit is taken, |u| has already been
        // consumed from |src| but has not been written, and
        // |mOddHighSurrogate| is not updated from the local copy. Looks like
        // the unit is dropped on resume -- confirm and fix separately.
        if (dest == destEnd)
          goto error;
        oddHighSurrogate = 0;
      }
      *dest++ = u;
    } else if (NS_IS_HIGH_SURROGATE(u)) {
      if (oddHighSurrogate) {
        // Two high surrogates in a row: the first one is unpaired.
        if (mErrBehavior == kOnError_Signal) {
          goto error2;
        }
        *dest++ = UCS2_REPLACEMENT_CHAR;
        // NOTE(review): same caveat as in the non-surrogate branch above;
        // |u| is not remembered when this exit is taken.
        if (dest == destEnd)
          goto error;
      }
      oddHighSurrogate = u;
    } else /* if (NS_IS_LOW_SURROGATE(u)) */ {
      if (oddHighSurrogate && *aDestLength > 1) {
        if (dest + 1 >= destEnd) {
          // Only one slot left: park the whole pair and ask for more output
          // space; it is flushed by the STATE_ODD_SURROGATE_PAIR case above.
          mOddLowSurrogate = u;
          mOddHighSurrogate = oddHighSurrogate;
          mState = STATE_ODD_SURROGATE_PAIR;
          goto error;
        }
        *dest++ = oddHighSurrogate;
        *dest++ = u;
      } else {
        // Low surrogate without a preceding high surrogate, or an output
        // buffer whose total capacity cannot hold a pair.
        if (mErrBehavior == kOnError_Signal) {
          goto error2;
        }
        *dest++ = UCS2_REPLACEMENT_CHAR;
      }
      oddHighSurrogate = 0;
    }
  }

  if (src != srcEnd) {
    // store the lead byte of a 16-bit unit for the next run.
    mOddByte = *src++;
    mState = STATE_HALF_CODE_POINT;
  }

  mOddHighSurrogate = oddHighSurrogate;

  *aDestLength = dest - aDest;
  *aSrcLength = src - aSrc;
  return (mState != STATE_NORMAL || oddHighSurrogate) ?
         NS_OK_UDEC_MOREINPUT : NS_OK;

error:
  // Output buffer exhausted.
  *aDestLength = dest - aDest;
  *aSrcLength = src - aSrc;
  return NS_OK_UDEC_MOREOUTPUT;

error2:
  // Ill-formed input in signalling mode. The reported source length is one
  // byte less than the current read position.
  *aDestLength = dest - aDest;
  *aSrcLength = --src - aSrc;
  return NS_ERROR_ILLEGAL_INPUT;
}

/**
 * Puts the decoder back into its initial state: STATE_FIRST_CALL with no
 * pending byte and no pending surrogates.
 */
NS_IMETHODIMP
nsUTF16ToUnicodeBase::Reset()
{
  mState = STATE_FIRST_CALL;
  mOddByte = 0;
  mOddHighSurrogate = 0;
  mOddLowSurrogate = 0;
  return NS_OK;
}

/**
 * Computes an output size, in char16_t units, for |aSrcLength| input bytes
 * given the data currently held over from the previous run.
 *
 * @param aSrc        unused.
 * @param aSrcLength  number of input bytes about to be converted.
 * @param aDestLength out: the computed number of char16_t units.
 * @return NS_OK, or NS_ERROR_OUT_OF_MEMORY if the length computation
 *         overflows.
 */
NS_IMETHODIMP
nsUTF16ToUnicodeBase::GetMaxLength(const char * aSrc, int32_t aSrcLength,
                                   int32_t * aDestLength)
{
  mozilla::CheckedInt32 length = aSrcLength;

  // This bit is set in both STATE_HALF_CODE_POINT and STATE_SECOND_BYTE.
  if (STATE_HALF_CODE_POINT & mState) {
    length += 1;
  }

  if (!length.isValid()) {
    return NS_ERROR_OUT_OF_MEMORY;
  }

  // the left-over data of the previous run have to be taken into account.
  *aDestLength = length.value() / 2;
  if (mOddHighSurrogate)
    (*aDestLength)++;
  if (mOddLowSurrogate)
    (*aDestLength)++;
  return NS_OK;
}

/**
 * Big-endian UTF-16 decoder entry point.
 *
 * On the first call(s) it looks for the byte sequence FE FF at the start of
 * the stream, which may arrive split over two calls, and adjusts |mState| so
 * that UTF16ConvertToUnicode skips it. It then delegates to
 * UTF16ConvertToUnicode, swapping bytes on little-endian hosts.
 */
NS_IMETHODIMP
nsUTF16BEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
                            char16_t * aDest, int32_t * aDestLength)
{
  switch (mState) {
    case STATE_FIRST_CALL:
      if (*aSrcLength < 2) {
        if (*aSrcLength < 1) {
          *aDestLength = 0;
          return NS_OK;
        }
        if (uint8_t(*aSrc) != 0xFE) {
          // Cannot be a BOM; decode the byte as ordinary data.
          mState = STATE_NORMAL;
          break;
        }
        // Possibly the first half of a BOM; decide on the next call.
        *aDestLength = 0;
        mState = STATE_SECOND_BYTE;
        return NS_OK_UDEC_MOREINPUT;
      }
      // Compare the two bytes individually. This matches the same inputs as
      // a 16-bit load compared against a host-order constant, without
      // reading through a possibly misaligned char16_t pointer.
      if (uint8_t(aSrc[0]) != 0xFE || uint8_t(aSrc[1]) != 0xFF) {
        mState = STATE_NORMAL;
      }
      break;

    case STATE_SECOND_BYTE:
      if (*aSrcLength < 1) {
        *aDestLength = 0;
        return NS_OK_UDEC_MOREINPUT;
      }
      if (uint8_t(*aSrc) != 0xFF) {
        // The 0xFE seen on the previous call was data, not half of a BOM.
        mOddByte = 0xFE;
        mState = STATE_HALF_CODE_POINT;
      }
      break;
  }

  return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
                               bool(MOZ_LITTLE_ENDIAN));
}

/**
 * Little-endian UTF-16 decoder entry point.
 *
 * On the first call(s) it looks for the byte sequence FF FE at the start of
 * the stream, which may arrive split over two calls, and adjusts |mState| so
 * that UTF16ConvertToUnicode skips it. It then delegates to
 * UTF16ConvertToUnicode, swapping bytes on big-endian hosts.
 */
NS_IMETHODIMP
nsUTF16LEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
                            char16_t * aDest, int32_t * aDestLength)
{
  switch (mState) {
    case STATE_FIRST_CALL:
      if (*aSrcLength < 2) {
        if (*aSrcLength < 1) {
          *aDestLength = 0;
          return NS_OK;
        }
        if (uint8_t(*aSrc) != 0xFF) {
          // Cannot be a BOM; decode the byte as ordinary data.
          mState = STATE_NORMAL;
          break;
        }
        // Possibly the first half of a BOM; decide on the next call.
        *aDestLength = 0;
        mState = STATE_SECOND_BYTE;
        return NS_OK_UDEC_MOREINPUT;
      }
      // Compare the two bytes individually. This matches the same inputs as
      // a 16-bit load compared against a host-order constant, without
      // reading through a possibly misaligned char16_t pointer.
      if (uint8_t(aSrc[0]) != 0xFF || uint8_t(aSrc[1]) != 0xFE) {
        mState = STATE_NORMAL;
      }
      break;

    case STATE_SECOND_BYTE:
      if (*aSrcLength < 1) {
        *aDestLength = 0;
        return NS_OK_UDEC_MOREINPUT;
      }
      if (uint8_t(*aSrc) != 0xFE) {
        // The 0xFF seen on the previous call was data, not half of a BOM.
        mOddByte = 0xFF;
        mState = STATE_HALF_CODE_POINT;
      }
      break;
  }

  return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
                               bool(MOZ_BIG_ENDIAN));
}

/**
 * Forgets the detected endianness and BOM flag, then resets the base
 * decoder state.
 */
NS_IMETHODIMP
nsUTF16ToUnicode::Reset()
{
  mEndian = kUnknown;
  mFoundBOM = false;
  return nsUTF16ToUnicodeBase::Reset();
}

/**
 * Endianness-detecting UTF-16 decoder entry point.
 *
 * On the first call the first two bytes select |mEndian|, either from a BOM
 * or from a heuristic. Every call then delegates to UTF16ConvertToUnicode.
 *
 * @return NS_ERROR_ILLEGAL_INPUT if the first call supplies exactly one
 *         byte; NS_OK_UDEC_NOBOMFOUND in place of NS_OK when no BOM has been
 *         seen; otherwise the result of UTF16ConvertToUnicode.
 */
NS_IMETHODIMP
nsUTF16ToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
                          char16_t * aDest, int32_t * aDestLength)
{
  if (STATE_FIRST_CALL == mState) // first time called
  {
    if (*aSrcLength < 2)
    {
      // Two bytes are needed to inspect the start of the stream.
      nsresult res = (*aSrcLength == 0) ? NS_OK : NS_ERROR_ILLEGAL_INPUT;
      *aSrcLength = 0;
      *aDestLength = 0;
      return res;
    }

    // check if BOM (0xFEFF) is at the beginning, remove it if found, and
    // set mEndian accordingly. |mState| stays STATE_FIRST_CALL in the two
    // BOM cases so that UTF16ConvertToUnicode skips those two bytes.
    if (0xFF == uint8_t(aSrc[0]) && 0xFE == uint8_t(aSrc[1])) {
      mEndian = kLittleEndian;
      mFoundBOM = true;
    }
    else if (0xFE == uint8_t(aSrc[0]) && 0xFF == uint8_t(aSrc[1])) {
      mEndian = kBigEndian;
      mFoundBOM = true;
    }
    // BOM is not found, but we can use a simple heuristic to determine
    // the endianness. Assume the first character is [U+0001, U+00FF].
    // Not always valid, but it's very likely to hold for html/xml/css.
    else if (!aSrc[0] && aSrc[1]) {  // 0x00 0xhh (hh != 00)
      mState = STATE_NORMAL;
      mEndian = kBigEndian;
    }
    else if (aSrc[0] && !aSrc[1]) {  // 0xhh 0x00 (hh != 00)
      mState = STATE_NORMAL;
      mEndian = kLittleEndian;
    }
    else {
      // Neither BOM nor 'plausible' byte patterns at the beginning.
      // Just assume it's BE (following Unicode standard)
      // and let the garbage show up in the browser. (security concern?)
      // (bug 246194)
      mState = STATE_NORMAL;
      mEndian = kBigEndian;
    }
  }

  // Swap bytes whenever the stream's endianness differs from the host's.
  nsresult rv = UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
#if MOZ_BIG_ENDIAN
                                      (mEndian == kLittleEndian)
#else
                                      (mEndian == kBigEndian)
#endif
                                      );

  // If BOM is not found and we're to return NS_OK, signal that BOM
  // is not found. Otherwise, return |rv| from |UTF16ConvertToUnicode|
  return (rv == NS_OK && !mFoundBOM) ? NS_OK_UDEC_NOBOMFOUND : rv;
}