utf8 to unicode

Posted by littletiger

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了utf8 to unicode相关的知识,希望对你有一定的参考价值。

下面的函数将 UTF-8 字符串(例如 JSON 中的文本)转换为 Unicode,并以小端序的 16 位编码单元输出(由 stm32 发烧群友提供),仅留做参考,不保证其准确及可用。

/*
 * Convert a NUL-terminated UTF-8 string into UTF-16 little-endian bytes.
 *
 * dst : output buffer. Receives two bytes per UTF-16 code unit (low byte
 *       first), followed by a single 0 byte. No bounds checking is done;
 *       2 * strlen(src) + 1 bytes is always sufficient.
 * src : NUL-terminated UTF-8 input. It is only read.
 *
 * Returns the number of bytes stored in dst, not counting the trailing 0.
 *
 * Behavior notes:
 *  - Spaces (U+0020) seen before any output has been produced are skipped.
 *  - Code points above U+FFFF are written as a surrogate pair (4 bytes).
 *  - Conversion stops silently at the first sequence that does not match a
 *    1..4 byte UTF-8 pattern or that decodes above U+10FFFF; everything
 *    converted up to that point is kept.
 *  - Overlong forms and encoded surrogates are not rejected.
 */
u32 UTF8_to_Unicode(char *dst, char *src)  //json utf8 to unicode
{
    u32 i = 0, unicode = 0;
    int codeLen = 0;

    while ( *src )
    {
        // 1. Decode one UTF-8 sequence into a code point.
        // The && chains short-circuit on the terminating NUL, so no byte
        // past the end of the string is ever read.
        if (0 == (src[0] & 0x80))
        {
            // Single byte: 0xxxxxxx
            codeLen = 1;
            unicode = (u32)src[0];
        }
        else if (0xC0 == (src[0] & 0xE0) && 0x80 == (src[1] & 0xC0))
        {
            // Two bytes: 110xxxxx 10xxxxxx
            codeLen = 2;
            unicode = (((u32)src[0] & 0x1F) << 6) | ((u32)src[1] & 0x3F);
        }
        else if (0xE0 == (src[0] & 0xF0) && 0x80 == (src[1] & 0xC0) && 0x80 == (src[2] & 0xC0))
        {
            // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
            codeLen = 3;
            unicode = (((u32)src[0] & 0x0F) << 12) | (((u32)src[1] & 0x3F) << 6) | ((u32)src[2] & 0x3F);
        }
        else if (0xF0 == (src[0] & 0xF8) && 0x80 == (src[1] & 0xC0) && 0x80 == (src[2] & 0xC0) && 0x80 == (src[3] & 0xC0))
        {
            // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            // The lead mask is 0xF8 so that 0xF8..0xFF are not mistaken
            // for a four-byte lead.
            codeLen = 4;
            unicode = (((u32)src[0] & 0x07) << 18) | (((u32)src[1] & 0x3F) << 12) | (((u32)src[2] & 0x3F) << 6) | ((u32)src[3] & 0x3F);
            if (unicode > 0x10FFFF)
            {
                // Not representable in UTF-16: treat as malformed input.
                break;
            }
        }
        else
        {
            // Malformed or truncated sequence: stop converting.
            break;
        }
        src += codeLen;

        // Drop spaces that appear before the first emitted character.
        if (i == 0 && unicode == 0x20)
        {
            continue;
        }

        // 2. Emit the code point as UTF-16LE.
        if (unicode > 0xFFFF)
        {
            // Supplementary plane: split into high/low surrogates.
            u32 offset = unicode - 0x10000;
            u32 high = 0xD800 | (offset >> 10);
            u32 low = 0xDC00 | (offset & 0x3FF);

            *dst++ = (u8)(high & 0xff);
            *dst++ = (u8)((high >> 8) & 0xff);
            *dst++ = (u8)(low & 0xff);
            *dst++ = (u8)((low >> 8) & 0xff);
            i += 4;
        }
        else
        {
            *dst++ = (u8)(unicode & 0xff);
            *dst++ = (u8)((unicode >> 8) & 0xff);
            i += 2;
        }
    } // end while
    // Only one terminating byte is written, so the write footprint stays
    // within 2 * strlen(src) + 1 bytes.
    *dst = 0;

    return i;
}

 

以上是关于utf8 to unicode的主要内容,如果未能解决你的问题,请参考以下文章

该死的编码

python2.7运行出现的Warning: UnicodeWarning: Unicode equal comparison failed to convert both arguments to

utf8 unicode 编码互转

php emoji utf8转unicode

MySQL Convert latin1 to utf8, cp1252 0x80-0x9F 错误

cstring to utf8