如何在 C++ 中将字符串从十进制代码转换为西里尔文/unicode16?
Posted
技术标签:
【中文标题】如何在 C++ 中将字符串从十进制代码转换为西里尔文/unicode16?【英文标题】:How to convert string from decimal code to cyrillic / unicode16 in C++? 【发布时间】:2021-02-03 21:54:33 【问题描述】:在我的 c++ 程序中,我有一个“十进制代码字符串”,我想使用 unicode 字符将它打印到一个文件中。
这是十进制代码字符串(HTML 数字字符实体):&#1082;&#1074;&#1072;&#1088;&#1090;&#1080;&#1088;&#1072;
这是西里尔解码后的字符串:Квартира
这是我的代码:
#include "stdafx.h"
#include<iostream>
#include<algorithm>
#include<cmath>
#include<cstdio>
using namespace std;
int _tmain(int argc, _TCHAR* argv[])
// string I have:
string decimal_code_string = "квартира";
FILE *stream;
fopen_s( &stream, "C:\\Users\\Luze\\Desktop\\myFile.txt", "w+, ccs=UTF-8" );
//printf("--> %s", decimal_code_string.c_str());
// string I would like to write: "Квартира"
fwprintf(stream, L"%hs", decimal_code_string.c_str());
fclose(stream);
return 0;
[编辑]
我尝试了@MarkRansom 解决方案,但它似乎不起作用:
#include "stdafx.h"
#include<iostream>
#include<algorithm>
#include<cmath>
#include<cstdio>
using namespace std;
// Encodes a NUL-terminated wide string as UTF-8.
//
// @param in  Pointer to a NUL-terminated wchar_t sequence. Units in the range
//            0xD800-0xDBFF / 0xDC00-0xDFFF are treated as UTF-16 high / low
//            surrogates and combined into one code point; any other unit is
//            taken as a code point directly.
// @return    The UTF-8 encoding (1 to 4 bytes per code point).
//
// Unpaired surrogates are not rejected: a high surrogate that is not followed
// by a low surrogate is dropped, and a lone low surrogate is encoded from its
// low 10 bits.
std::string wchar_to_UTF8(const wchar_t * in)
{
    std::string out;
    unsigned int codepoint = 0;
    for (; *in != 0; ++in)
    {
        if (*in >= 0xd800 && *in <= 0xdbff)
        {
            // High surrogate: keep the upper bits and wait for the low half.
            codepoint = ((*in - 0xd800) << 10) + 0x10000;
        }
        else
        {
            if (*in >= 0xdc00 && *in <= 0xdfff)
                codepoint |= *in - 0xdc00;   // low surrogate completes the pair
            else
                codepoint = *in;

            if (codepoint <= 0x7f)
            {
                out.append(1, static_cast<char>(codepoint));
            }
            else if (codepoint <= 0x7ff)
            {
                out.append(1, static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f)));
                out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
            }
            else if (codepoint <= 0xffff)
            {
                out.append(1, static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f)));
                out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
                out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
            }
            else
            {
                out.append(1, static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07)));
                out.append(1, static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f)));
                out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
                out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
            }
            // Reset so a stale high-surrogate value cannot leak into the next unit.
            codepoint = 0;
        }
    }
    return out;
}
// Decodes a NUL-terminated UTF-8 byte string into a wide string.
//
// @param in  Pointer to a NUL-terminated char sequence holding UTF-8.
// @return    One wchar_t per code point when wchar_t is wider than 16 bits;
//            otherwise UTF-16, with code points above 0xFFFF written as a
//            surrogate pair and code points inside the surrogate range
//            0xD800-0xDFFF dropped.
//
// Code points above 0x10FFFF are dropped. The input is not otherwise
// validated: a code point is emitted as soon as the next byte is not a
// continuation byte.
std::wstring UTF8_to_wchar(const char * in)
{
    std::wstring out;
    // Start from zero so that a stray leading continuation byte does not read
    // an indeterminate value.
    unsigned int codepoint = 0;
    while (*in != 0)
    {
        const unsigned char ch = static_cast<unsigned char>(*in);
        if (ch <= 0x7f)
            codepoint = ch;                               // single-byte (ASCII)
        else if (ch <= 0xbf)
            codepoint = (codepoint << 6) | (ch & 0x3f);   // continuation byte
        else if (ch <= 0xdf)
            codepoint = ch & 0x1f;                        // lead byte of 2
        else if (ch <= 0xef)
            codepoint = ch & 0x0f;                        // lead byte of 3
        else
            codepoint = ch & 0x07;                        // lead byte of 4
        ++in;

        // The code point is complete once the following byte is not a
        // continuation byte (this includes the terminating NUL).
        if (((*in & 0xc0) != 0x80) && (codepoint <= 0x10ffff))
        {
            if (sizeof(wchar_t) > 2)
            {
                out.append(1, static_cast<wchar_t>(codepoint));
            }
            else if (codepoint > 0xffff)
            {
                // UTF-16 splits (code point - 0x10000) into two 10-bit halves;
                // without the subtraction the high surrogate is off by 0x40.
                const unsigned int offset = codepoint - 0x10000;
                out.append(1, static_cast<wchar_t>(0xd800 + (offset >> 10)));
                out.append(1, static_cast<wchar_t>(0xdc00 + (offset & 0x03ff)));
            }
            else if (codepoint < 0xd800 || codepoint >= 0xe000)
            {
                out.append(1, static_cast<wchar_t>(codepoint));
            }
        }
    }
    return out;
}
int _tmain(int argc, _TCHAR* argv[])
string decimal_code_string = "квартира";
printf("decimal_code: %s\n", decimal_code_string.c_str());
// string I would like to write: "Квартира"
wstring temp_string = wstring(decimal_code_string.begin(), decimal_code_string.end());
const wchar_t* result = temp_string.c_str();
string utf8_string= wchar_to_UTF8( result );
printf("wchar_to_UTF8: %s\n", utf8_string);
const char* resultcp = decimal_code_string.c_str();
wstring wide_string = UTF8_to_wchar( resultcp );
printf("UTF8_to_wchar: %s\n", wide_string);
cout << "End\n";
return 0;
【问题讨论】:
这能回答你的问题吗? How to decode html Entities in C? @trentcl 该问题的最高投票答案转换为 UTF-8,而不是宽字符。 @MarkRansom 可以适配 UTF-16 等。 @tadman 是的,我知道,我什至在 Stack Overflow 上有一些代码可以做到这一点:stackoverflow.com/a/148766/5987。看起来有些人急于将其作为重复问题关闭,但事实并非如此。 我链接到的代码并不是整个解决方案,而是要与 @trentcl 提供的答案相结合。 【参考方案1】:只需提取每个字符内的数字实体:这正是该字符的 Unicode/UTF32 代码。
提取该字符串后,您可以使用任何 UTF32 到 UTF8 字符串转换算法,例如实验性的 StringSuite C++20 库。
【讨论】:
以上是关于如何在 C++ 中将字符串从十进制代码转换为西里尔文/unicode16?的主要内容,如果未能解决你的问题,请参考以下文章
如何在 Java String 中将西里尔字母转换为英文拉丁文?
如何在 C++ 中将十六进制字符串转换为字节字符串? [复制]
如何在 Visual C++ 中将字节数组转换为十六进制字符串?