utf8 string

Posted 花式撸管手

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了utf8 string相关的知识,希望对你有一定的参考价值。

https://github.com/BassLC/idUTF8lib

 

Idiot‘s UTF-8 Library

A very (too much really) simple Utf8 library for C++

Usage

#include "lib/idutf8lib.hpp"

Utf8String text; //Empty UTF8 object
Utf8String utf8_text("Hé??ò ?órld"); //std::string compatible constructor
text = "Jello!"; //Supports assignment with std::string AND Utf8String objects
text.to_string(); // == std::string("Jello!")

utf8_text.size_in_chars(); // == 11
utf8_text.size_in_bytes(); // == 18 

utf8_text[0]; // == std::string("H")
utf8_text.sub_utf8str(1,3); // == Utf8String("é??")

Features

  • Decodes and parses UTF-8 strings correctly (at least until now)
  • Very lightweight and small: less than 200 newlines total (*without counting tests)

Requirements

  • A C++14 compatible compiler

Notes

Makefile serves only for testing purposes.

Uses the Catch framework for tests.

Thanks

UTF8-CPP

tiny-utf8

 

#ifndef UTF8_CPP
#define UTF8_CPP

#include <string>
#include <vector>


// A UTF-8 string stored as a sequence of characters, where each character
// keeps its own encoded bytes. This makes character-based indexing and
// slicing possible without re-scanning the byte stream.
class Utf8String {

private:
    // Outer vector: one entry per character.
    // Inner vector: the encoded bytes of that character.
    using Utf8Struct = std::vector<std::vector<uint8_t>>;
    
    Utf8Struct content;

    // Returns true when `string` passes the UTF-8 check that the
    // std::string constructor runs before splitting it into characters.
    bool is_valid_utf8_string(const std::string &string) const;

public:
    Utf8String() = default;
    Utf8String(const Utf8String &) = default;
    // Builds the object from an already split character sequence.
    Utf8String(Utf8Struct &&content);
    // Splits `string` into characters.
    // Throws std::runtime_error when the validity check fails.
    Utf8String(const std::string &string);
    ~Utf8String() = default;
    
    // Concatenation of the bytes of every stored character.
    std::string to_string() const;
    // Number of stored characters.
    std::size_t size_in_chars() const;
    // Sum of the byte counts of all stored characters.
    std::size_t size_in_bytes() const;
    // Removes every stored character.
    void clear();
    // Returns `distance` characters starting at character index
    // `initial_pos`; with the default `distance` the slice runs to the end.
    // Throws std::out_of_range when the requested range does not fit.
    Utf8String sub_utf8str(const std::size_t &initial_pos, const std::size_t &distance = std::string::npos) const;

    // Re-parses `string` and replaces the stored characters with the result.
    void operator=(const std::string &string);
    // Replaces the stored characters with a copy of the other object's.
    void operator=(const Utf8String &utf8_structure) noexcept;
    
    // Returns a new object holding this object's characters followed by
    // the characters of `utf8_structure`.
    Utf8String operator+(const Utf8String &utf8_structure) const noexcept;
    // Appends the characters of `utf8_structure` to this object.
    void operator+=(const Utf8String &utf8_structure) noexcept;
    
    // Returns the bytes of the character at index `pos` as a std::string.
    // Throws std::out_of_range when `pos` is not a valid character index.
    std::string operator[](const std::size_t &pos) const;

    // Writes to_string() to `out`.
    friend std::ostream& operator<<(std::ostream &out, const Utf8String &utf8_structure) noexcept;

    // Character-by-character comparison with another Utf8String.
    bool operator==(const Utf8String &utf8_structure) const noexcept;
    // Compares to_string() with `string`.
    bool operator==(const std::string &string) const noexcept;
};

#endif

 

 

#include "idutf8lib.hpp"
#include <iostream>
#include <bitset>
#include <exception>

//* Private functions *

// Checks that `string` is well-formed UTF-8 as defined by RFC 3629.
//
// Returns false for:
//   - a continuation byte (0x80-0xBF) where a lead byte is expected,
//   - lead bytes that can never occur (0xC0, 0xC1, 0xF5-0xFF),
//   - a sequence cut short by the end of the string,
//   - a trailing byte that is not a continuation byte,
//   - overlong encodings, UTF-16 surrogates (U+D800-U+DFFF) and code
//     points above U+10FFFF.
// Returns true otherwise, including for the empty string.
bool Utf8String::is_valid_utf8_string(const std::string &string) const {
    const std::size_t length = string.size();
    std::size_t pos = 0;

    while ( pos < length ) {
        // Work on the unsigned value so the comparisons below do not depend
        // on whether plain char is signed on this platform.
        const unsigned char lead = static_cast<unsigned char>(string[pos]);

        //ASCII character
        if ( lead < 0x80 ) {
            ++pos;
            continue;
        }

        // Number of continuation bytes the lead byte announces, and the
        // range allowed for the first of them. That range is narrower than
        // 0x80-0xBF for the lead bytes where the full range would admit
        // overlong forms, surrogates or values above U+10FFFF.
        std::size_t trailing = 0;
        unsigned char second_min = 0x80;
        unsigned char second_max = 0xBF;

        if ( lead >= 0xC2 && lead <= 0xDF ) {
            trailing = 1;

        } else if ( lead >= 0xE0 && lead <= 0xEF ) {
            trailing = 2;

            if ( lead == 0xE0 ) {
                second_min = 0xA0; //Below this the encoding is overlong
            } else if ( lead == 0xED ) {
                second_max = 0x9F; //Above this lie the UTF-16 surrogates
            }

        } else if ( lead >= 0xF0 && lead <= 0xF4 ) {
            trailing = 3;

            if ( lead == 0xF0 ) {
                second_min = 0x90; //Below this the encoding is overlong
            } else if ( lead == 0xF4 ) {
                second_max = 0x8F; //Above this the value exceeds U+10FFFF
            }

        } else {
            //Stray continuation byte, or a byte that is never a valid lead
            return false;
        }

        //The whole sequence has to fit in what is left of the string
        if ( trailing >= length - pos ) {
            return false;
        }

        const unsigned char second = static_cast<unsigned char>(string[pos + 1]);
        if ( second < second_min || second > second_max ) {
            return false;
        }

        //Remaining bytes only need the 10xxxxxx continuation pattern
        for ( std::size_t offset = 2; offset <= trailing; ++offset ) {
            const unsigned char byte = static_cast<unsigned char>(string[pos + offset]);
            if ( (byte & 0xC0) != 0x80 ) {
                return false;
            }
        }

        pos += trailing + 1;
    }

    return true;
}


//* Constructors *

// Splits `string` into characters, one byte vector per character.
// Throws std::runtime_error if is_valid_utf8_string() rejects the input.
Utf8String::Utf8String(const std::string &string) {
    if ( !is_valid_utf8_string(string) ) {
        throw(std::runtime_error("Invalid UTF8 String in constructor"));
    }

    //Bytes of the multi-byte character currently being collected
    std::vector<uint8_t> pending;

    //Stores the collected character, if any, and starts a fresh one
    const auto flush_pending = [this, &pending]() {
        if ( pending.empty() ) {
            return;
        }
        content.push_back(pending);
        pending.clear();
    };

    for ( const char byte : string ) {
        //The two highest bits tell the role of the byte
        switch ( static_cast<unsigned char>(byte) >> 6 ) {

        //11xxxxxx: first byte of a multi-byte character
        case 0b11:
            flush_pending();
            pending.push_back(byte);
            break;

        //10xxxxxx: continuation of the character being collected
        case 0b10:
            pending.push_back(byte);
            break;

        //0xxxxxxx: ASCII, stored on its own after the character before it
        default:
            flush_pending();
            content.push_back(std::vector<uint8_t>(1, byte));
            break;
        }
    }

    //If last character is non-ASCII
    flush_pending();
}


// Takes ownership of an already split character sequence.
// `temp` is a named rvalue reference and therefore an lvalue inside this
// function, so it has to be moved explicitly; a plain assignment would copy
// every character.
Utf8String::Utf8String(Utf8Struct &&temp) : content(std::move(temp)) {}


//* Public Interface *

// Returns the bytes of all stored characters, in order, as one std::string.
std::string Utf8String::to_string() const {
    std::string flattened;

    for ( const std::vector<uint8_t> &bytes : content ) {
        flattened.append(bytes.begin(), bytes.end());
    }

    return flattened;
}


// Number of stored characters (not bytes).
std::size_t Utf8String::size_in_chars() const {
    return content.size();
}


// Total number of bytes over all stored characters.
std::size_t Utf8String::size_in_bytes() const {
    std::size_t total = 0;

    for ( std::size_t idx = 0; idx != content.size(); ++idx ) {
        total += content[idx].size();
    }

    return total;
}


// Removes every stored character, leaving an empty string.
void Utf8String::clear() {
    content.clear();
}


// Returns `distance` characters starting at character index `initial_pos`.
// When `distance` is std::string::npos the slice runs to the end.
//
// Throws std::out_of_range when `initial_pos` is not a valid character
// index or when fewer than `distance` characters follow it.
Utf8String Utf8String::sub_utf8str(const std::size_t &initial_pos, const std::size_t &distance) const {

    // To be sure we don't try to overflow
    if ( initial_pos >= content.size() ) {
        throw std::out_of_range("Too big substr access");
    }

    // Compare `distance` against what is left instead of forming
    // `initial_pos + distance`: that sum can wrap around for a huge
    // `distance`, slip past the bounds check and produce an end iterator
    // that lies before the start iterator.
    const std::size_t available = content.size() - initial_pos;

    if ( distance != std::string::npos && distance > available ) {
        throw std::out_of_range("Too big substr access");
    }

    const std::size_t count = (distance == std::string::npos) ? available : distance;
    const auto first = content.begin() + initial_pos;

    return Utf8String(Utf8Struct(first, first + count));
}


//* Operators *

// Re-parses `string` and replaces the stored characters with the result.
// Parsing happens on a separate object first, so this object is left
// untouched if the string constructor throws.
void Utf8String::operator=(const std::string &string) {
    Utf8String parsed(string);
    content.swap(parsed.content);
}


// Replaces the stored characters with a copy of those in `utf8_object`.
void Utf8String::operator=(const Utf8String &utf8_object) noexcept {
    const Utf8Struct &source = utf8_object.content;
    content = source;
}


// Returns the bytes of the character at index `pos` as a std::string.
// Throws std::out_of_range when `pos` is not a valid character index.
std::string Utf8String::operator[](const std::size_t &pos) const {
    if ( !(pos < content.size()) ) {
        throw std::out_of_range("Bad UTF-8 range access with []");
    }

    const std::vector<uint8_t> &bytes = content[pos];
    return std::string(bytes.begin(), bytes.end());
}


// Returns a new object holding this object's characters followed by the
// characters of `utf8_structure`. Neither operand is modified.
Utf8String Utf8String::operator+(const Utf8String &utf8_structure) const noexcept {
    Utf8String joined(*this);
    const Utf8Struct &tail = utf8_structure.content;

    joined.content.insert(joined.content.end(), tail.begin(), tail.end());
    return joined;
}


// Appends the characters of `utf8_structure` to this object.
// Appending an object to itself (`s += s`) is supported.
void Utf8String::operator+=(const Utf8String &utf8_structure) noexcept {
    if ( this == &utf8_structure ) {
        // vector::insert must not be given iterators into the vector it
        // modifies, so self-append goes through a copy.
        const Utf8Struct snapshot = content;
        content.insert(content.end(), snapshot.begin(), snapshot.end());
        return;
    }

    const Utf8Struct &tail = utf8_structure.content;
    content.insert(content.end(), tail.begin(), tail.end());
}


// Writes the bytes of `utf8_structure` to `out` and returns `out`.
std::ostream& operator<<(std::ostream &out, const Utf8String &utf8_structure) noexcept{
    return out << utf8_structure.to_string();
}


// True when both objects hold the same characters in the same order.
bool Utf8String::operator==(const Utf8String &utf8_structure) const noexcept {
    const Utf8Struct &theirs = utf8_structure.content;
    return content == theirs;
}


// True when the bytes of this object equal `string`.
bool Utf8String::operator==(const std::string &string) const noexcept {
    const std::string flattened = to_string();
    return flattened == string;
}

 

以上是关于utf8 string的主要内容,如果未能解决你的问题,请参考以下文章

C++ jstring to string (UTF16->UTF8)

C++ jstring to string (UTF16->UTF8)

Python string中删除(过滤)掉emoji表情字符

CSP核心代码片段记录

如何将代码片段存储在 mongodb 中?

iOS 控制台打印unicode 转中文汉字 UTF8String