test-client-godot/modules/bitstream/convert_utf16.cpp

/*
 * Convert a UTF-16 string to UTF-8, mapping indices to provide low-complexity
 * range and index lookups.
 *
 * Copyright 2010 Rasmus Andersson. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "convert_utf16.h"
#include "core/array.h"
#include "core/variant.h"
#include "core/int_types.h"
#include "modules/debug/debug.h"

// ----------------------------------------------------------------------------
// Macros extracted from icu/unicode/utf16.h

/**
 * Is this code unit a lead surrogate (U+d800..U+dbff)?
 * Only the top bits are compared, so any value in that range matches.
 * @param c 16-bit code unit (evaluated once)
 * @return non-zero if c is a lead surrogate, 0 otherwise
 * @stable ICU 2.4
 */
#define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)

/**
 * Is this code unit a trail surrogate (U+dc00..U+dfff)?
 * Only the top bits are compared, so any value in that range matches.
 * @param c 16-bit code unit (evaluated once)
 * @return non-zero if c is a trail surrogate, 0 otherwise
 * @stable ICU 2.4
 */
#define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)

/**
 * Helper constant for U16_GET_SUPPLEMENTARY. (0x35fdc00)
 * It folds the lead base (0xd800<<10), the trail base (0xdc00) and the
 * 0x10000 supplementary-plane bias into a single subtraction.
 * @internal
 */
#define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)

/**
 * Get a supplementary code point value (U+10000..U+10ffff)
 * from its lead and trail surrogates.
 * The result is undefined if the input values are not
 * lead and trail surrogates.
 *
 * @param lead lead surrogate (U+d800..U+dbff)
 * @param trail trail surrogate (U+dc00..U+dfff)
 * @return supplementary code point (U+10000..U+10ffff) as uint32_t
 * @stable ICU 2.4
 */
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    (((uint32_t)(lead)<<10UL)+(uint32_t)(trail)-U16_SURROGATE_OFFSET)

/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16 and performs no bounds check.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following trail surrogate as well.
 * If the offset points to a trail surrogate, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset points to a single, unpaired lead surrogate.
 *
 * The expansion is a single statement (do/while(0)), so it can be used as the
 * unbraced body of an if/else. The arguments s, i and c are evaluated more
 * than once; pass plain variables, not expressions with side effects.
 *
 * @param s string of code units
 * @param i string offset, advanced by 1 or 2
 * @param c output uint32_t variable
 * @see U16_NEXT
 * @stable ICU 2.4
 */
#define U16_NEXT_UNSAFE(s, i, c) do { \
    (c)=(s)[(i)++]; \
    if(U16_IS_LEAD(c)) { \
        (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
    } \
} while(0)

/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following trail surrogate as well.
 * If the offset points to a trail surrogate or
 * to a single, unpaired lead surrogate, then that itself
 * will be returned as the code point.
 *
 * The expansion is a single statement (do/while(0)), so it can be used as the
 * unbraced body of an if/else. The arguments s, i, length and c are evaluated
 * more than once; pass plain variables, not expressions with side effects.
 *
 * @param s string of code units
 * @param i string offset, must be i<length; advanced by 1 or 2
 * @param length string length
 * @param c output uint32_t variable
 * @see U16_NEXT_UNSAFE
 * @stable ICU 2.4
 */
#define U16_NEXT(s, i, length, c) do { \
    (c)=(s)[(i)++]; \
    if(U16_IS_LEAD(c)) { \
        /* Scratch unit for the candidate trail surrogate; the name avoids */ \
        /* the double-underscore prefix reserved for the implementation.   */ \
        uint16_t u16_next_trail_; \
        if((i)<(length) && U16_IS_TRAIL(u16_next_trail_=(s)[(i)])) { \
            ++(i); \
            (c)=U16_GET_SUPPLEMENTARY((c), u16_next_trail_); \
        } \
    } \
} while(0)

// end of icu/unicode/utf16.h
// ----------------------------------------------------------------------------

// convert_utf16_to_utf8 based on HUTF8MappedUTF16String.mm

/**
 * Encode a buffer of code units as UTF-8 and return it as a String.
 *
 * Each element of |u16_buf| is decoded with U16_NEXT, the resulting code
 * point is written as 1-4 UTF-8 bytes into a temporary NUL-terminated buffer,
 * and that buffer is handed to String::utf8().
 *
 * NOTE(review): |u16_buf| is declared as uint8_t*, so every element read here
 * is a single byte (0x00..0xff). Such a value can never match a surrogate, so
 * the surrogate-pair path of the decoder is unreachable with this signature.
 * If callers actually hold 16-bit code units, the parameter type has to change
 * in the header as well -- confirm against the callers.
 *
 * @param u16_buf input buffer; NULL is treated as empty input
 * @param u16_len number of elements of |u16_buf| to read
 * @return the converted text; an empty String for NULL or empty input
 */
String convert_utf16_to_utf8(uint8_t * u16_buf, size_t u16_len)
{
    if (u16_buf == NULL || u16_len == 0)
    {
        return String();
    }

    // Worst case is 4 UTF-8 bytes per input element, plus the terminating NUL.
    char *u8buf = new char[u16_len*4+1];
    size_t u8_len = 0;

    // For each code point (1-2 input elements)...
    for (size_t u16i = 0; u16i < u16_len; )
    {
        uint32_t u32c = 0;
        // Bounds-checked decode: never reads past u16_len, and an unpaired
        // surrogate is returned as-is instead of consuming the next element.
        U16_NEXT(u16_buf, u16i, u16_len, u32c);

        // Append u32c to u8buf (1-4 bytes)
        if (u32c <= 0x7f)
        {
            u8buf[u8_len++] = (char)u32c;
        }
        else if (u32c <= 0x7ff)
        {
            u8buf[u8_len++] = (char)((u32c>>6)|0xc0);
            u8buf[u8_len++] = (char)((u32c&0x3f)|0x80);
        }
        else if (u32c <= 0xffff)
        {
            u8buf[u8_len++] = (char)((u32c>>12)|0xe0);
            u8buf[u8_len++] = (char)(((u32c>>6)&0x3f)|0x80);
            u8buf[u8_len++] = (char)((u32c&0x3f)|0x80);
        }
        else
        {
            u8buf[u8_len++] = (char)((u32c>>18)|0xf0);
            u8buf[u8_len++] = (char)(((u32c>>12)&0x3f)|0x80);
            u8buf[u8_len++] = (char)(((u32c>>6)&0x3f)|0x80);
            u8buf[u8_len++] = (char)((u32c&0x3f)|0x80);
        }
    }
    u8buf[u8_len] = 0;

    String out = String::utf8(u8buf);
    delete [] u8buf;

    return out;
}

/**
 * Convert a String into a newly allocated buffer of UTF-16 code units.
 *
 * The UTF-8 form of |input| (String::utf8()) is decoded and validated one
 * code point at a time, then re-encoded as UTF-16; code points above U+FFFF
 * become a lead/trail surrogate pair.
 *
 * On success *u16_buf receives a buffer allocated with new[] and *u16_len is
 * set to the number of units written minus one.
 * NOTE(review): the decoder walks utf8.size() bytes, which presumably includes
 * the CharString's terminating NUL, so the buffer ends with a 0 unit that is
 * not counted in *u16_len -- confirm. The caller presumably releases the
 * buffer with delete[] -- confirm against callers.
 *
 * On malformed input (stray continuation byte, truncated or overlong
 * sequence, encoded surrogate, value above U+10FFFF) an error is printed,
 * *u16_len is set to 0 and *u16_buf is left untouched.
 *
 * In DEBUG_ENABLED builds a NULL |u16_len|, a NULL |u16_buf|, a non-NULL
 * *u16_buf on entry, or an internal length mismatch prints an error and
 * throws a const char* message.
 *
 * @param input   text to convert
 * @param u16_buf out: receives the allocated buffer; *u16_buf must be NULL on entry
 * @param u16_len out: receives the number of code units, excluding the final unit
 */
void convert_utf8_to_utf16(String input, uint16_t ** u16_buf, size_t * u16_len)
{
    Array unicode;
    CharString utf8 = input.utf8();
    int i = 0;

#ifdef DEBUG_ENABLED
    if ( u16_len == NULL )
    {
        ERR_PRINT("Bad ptr for u16_len");
        throw "Bad ptr for u16_len";
    }
    if ( u16_buf == NULL )
    {
        ERR_PRINT("Bad ptr for u16_buf");
        throw "Bad ptr for u16_buf";
    }
    if ( *u16_buf != NULL )
    {
        ERR_PRINT("Bad ptr u16_buf not empty");
        throw "Bad ptr u16_buf not empty";
    }
#endif

    // Pass 1: decode and validate UTF-8 into a list of code points.
    while (i < utf8.size())
    {
        uint32_t uni;       // code point being assembled
        uint32_t min_uni;   // smallest value that legitimately needs this sequence length
        size_t todo;        // continuation bytes still expected
        unsigned char ch = (unsigned char) utf8[i++];
        if (ch <= 0x7F)
        {
            uni = ch;
            min_uni = 0;
            todo = 0;
        }
        else if (ch <= 0xBF)
        {
            // Continuation byte where a lead byte was expected.
            ERR_PRINT("not a UTF-8 string");
            *u16_len = 0;
            return;
        }
        else if (ch <= 0xDF)
        {
            uni = ch&0x1F;
            min_uni = 0x80;
            todo = 1;
        }
        else if (ch <= 0xEF)
        {
            uni = ch&0x0F;
            min_uni = 0x800;
            todo = 2;
        }
        else if (ch <= 0xF7)
        {
            uni = ch&0x07;
            min_uni = 0x10000;
            todo = 3;
        }
        else
        {
            ERR_PRINT("not a UTF-8 string");
            *u16_len = 0;
            return;
        }
        for (size_t j = 0; j < todo; ++j)
        {
            if (i == utf8.size())
            {
                // Sequence is cut off by the end of the data.
                ERR_PRINT("not a UTF-8 string");
                *u16_len = 0;
                return;
            }
            ch = (unsigned char) utf8[i++];
            if (ch < 0x80 || ch > 0xBF)
            {
                ERR_PRINT("not a UTF-8 string");
                *u16_len = 0;
                return;
            }
            uni <<= 6;
            uni += ch & 0x3F;
        }
        // Reject overlong encodings, encoded surrogates and values beyond
        // the Unicode range.
        if (uni < min_uni || (uni >= 0xD800 && uni <= 0xDFFF) || uni > 0x10FFFF)
        {
            ERR_PRINT("not a UTF-8 string");
            *u16_len = 0;
            return;
        }
        unicode.push_back(Variant(uni));
    }

    // Pass 2: count the UTF-16 units needed (2 for code points above U+FFFF).
    size_t unit_count = 0;
    for (i = 0; i < unicode.size(); ++i)
    {
        unsigned long uni = unicode[i];
        unit_count += (uni <= 0xFFFF) ? 1 : 2;
    }

    if (unit_count == 0)
    {
        // Nothing was decoded. Decrementing a zero size_t length would wrap
        // around to SIZE_MAX, so return a lone 0 unit with length 0 instead.
        *u16_buf = new uint16_t[1];
        (*u16_buf)[0] = 0;
        *u16_len = 0;
        return;
    }

    *u16_buf = new uint16_t[unit_count];
    // The reported length leaves out the final unit (see NOTE(review) above).
    *u16_len = unit_count - 1;

    // Pass 3: encode.
    size_t ii = 0;
    DBG_PRINT("array (max:" + itos(*u16_len));
    for (i = 0; i < unicode.size(); ++i)
    {
        unsigned long uni = unicode[i];
        DBG_PRINT("array (max:" + itos(*u16_len) + " / i:" + itos(i) + " / current:" + itos(ii) + ")");
        if (uni <= 0xFFFF)
        {
            (*u16_buf)[ii++] = (uint16_t)uni;
        }
        else
        {
            uni -= 0x10000;
            (*u16_buf)[ii++] = (uint16_t)((uni >> 10) + 0xD800);
            (*u16_buf)[ii++] = (uint16_t)((uni & 0x3FF) + 0xDC00);
        }
    }
    ii--;
#ifdef DEBUG_ENABLED
    if(ii != *u16_len)
    {
        ERR_PRINT("Out of array (max:" + itos(*u16_len) + " / current:" + itos(ii) + ")");
        throw "Out of array";
    }
#endif
}