doc/dev/univalue__utffilter_8h_source.html

// Copyright 2016 Wladimir J. van der Laan

// Distributed under the MIT software license, see the accompanying

// file COPYING or https://opensource.org/licenses/mit-license.php.

#ifndef BITCOIN_UNIVALUE_INCLUDE_UNIVALUE_UTFFILTER_H

#define BITCOIN_UNIVALUE_INCLUDE_UNIVALUE_UTFFILTER_H


#include <cstdint>
#include <string>


// Incremental filter that validates text and appends it, as UTF-8, to a
// caller-supplied std::string.
//
// Input arrives either as raw bytes of a UTF-8 stream (push_back) or as whole
// code points / UTF-16 code units (push_back_u). A high surrogate
// (0xD800..0xDBFF) followed directly by a low surrogate (0xDC00..0xDFFF) via
// push_back_u is combined into one supplementary-plane code point.
//
// Any violation clears an internal validity flag, which never becomes true
// again; the verdict is obtained from finalize(). After a violation the
// contents appended to the output string are not meaningful.
class JSONUTF8StringFilter {
public:
    // Bind the filter to the output string `s`. Only a reference is kept, so
    // `s` must outlive the filter.
    explicit JSONUTF8StringFilter(std::string &s)
        : str(s),
          is_valid(true),
          codepoint(0),
          min_codepoint(0),
          state(0),
          surpair(0) {}

    // Feed one byte of a UTF-8 stream.
    //
    // ASCII bytes are appended immediately. Multi-byte sequences are decoded
    // first and only appended if they are well-formed: shortest-form encoding,
    // not a surrogate, and not above U+10FFFF.
    void push_back(uint8_t ch) {
        if (state == 0) {
            if (ch < 0x80) {
                // 7-bit ASCII. It must not split an open surrogate pair.
                if (surpair) is_valid = false;
                str.push_back(static_cast<char>(ch));
            } else if (ch < 0xc0) {
                // Continuation byte without a preceding lead byte.
                is_valid = false;
            } else if (ch < 0xe0) {
                // Lead byte of a 2-byte sequence.
                codepoint = (ch & 0x1f) << 6;
                min_codepoint = 0x80;
                state = 6;
            } else if (ch < 0xf0) {
                // Lead byte of a 3-byte sequence.
                codepoint = (ch & 0x0f) << 12;
                min_codepoint = 0x800;
                state = 12;
            } else if (ch < 0xf8) {
                // Lead byte of a 4-byte sequence.
                codepoint = (ch & 0x07) << 18;
                min_codepoint = 0x10000;
                state = 18;
            } else {
                // 0xf8..0xff never occur in UTF-8.
                is_valid = false;
            }
            return;
        }

        // Inside a multi-byte sequence: expect a continuation byte (10xxxxxx).
        if ((ch & 0xc0) != 0x80) is_valid = false;
        state -= 6;
        codepoint |= (ch & 0x3f) << state;
        if (state != 0) return;

        // Sequence complete.
        if (codepoint < min_codepoint) {
            // Overlong form: the value would have fit in a shorter sequence.
            is_valid = false;
        } else if (codepoint >= 0xD800 && codepoint < 0xE000) {
            // Surrogates are UTF-16 code units and must not be UTF-8 encoded.
            is_valid = false;
        } else {
            push_back_u(codepoint);
        }
    }

    // Feed one code point, or one UTF-16 code unit of a surrogate pair.
    //
    // A high surrogate is held back until the next call; it must be followed
    // directly by a low surrogate, and the pair is then appended as a single
    // code point. Everything else is appended as-is if it is at most U+10FFFF.
    void push_back_u(unsigned int codepoint_) {
        // A whole code point may not arrive in the middle of a UTF-8 sequence.
        if (state) is_valid = false;

        const bool is_high = codepoint_ >= 0xD800 && codepoint_ < 0xDC00;
        const bool is_low = codepoint_ >= 0xDC00 && codepoint_ < 0xE000;

        if (is_high) {
            if (surpair) {
                // Two high surrogates in a row.
                is_valid = false;
            } else {
                surpair = codepoint_;
            }
        } else if (is_low) {
            if (surpair) {
                // Combine the pair. The 0x10000 offset must be ADDED: the
                // shifted high half can itself occupy bit 16, so OR-ing would
                // lose the offset (e.g. D840 DC00 is U+20000, not U+10000).
                append_codepoint(0x10000 + ((surpair - 0xD800) << 10) +
                                 (codepoint_ - 0xDC00));
                surpair = 0;
            } else {
                // Low surrogate without a preceding high surrogate.
                is_valid = false;
            }
        } else if (surpair) {
            // High surrogate followed by something other than a low surrogate.
            is_valid = false;
        } else if (codepoint_ > 0x10FFFF) {
            // Beyond the Unicode code space; not encodable as UTF-8.
            is_valid = false;
        } else {
            append_codepoint(codepoint_);
        }
    }

    // Check that the input may end here: no unfinished UTF-8 sequence and no
    // pending high surrogate. Returns true only if no violation was seen.
    bool finalize() {
        if (state || surpair) is_valid = false;
        return is_valid;
    }

private:
    std::string &str; // Output string; UTF-8 is appended to it
    bool is_valid;    // Cleared on the first violation, never set again

    // Current UTF-8 decoding state
    unsigned int codepoint;     // Code point assembled so far
    unsigned int min_codepoint; // Smallest value allowed for the sequence
                                // length announced by the lead byte
    int state; // Top bit to be filled in for next UTF-8 byte, or 0

    // Keep track of the following state to handle the following section of
    // RFC4627:
    //
    //    To escape an extended character that is not in the Basic Multilingual
    //    Plane, the character is represented as a twelve-character sequence,
    //    encoding the UTF-16 surrogate pair.  So, for example, a string
    //    containing only the G clef character (U+1D11E) may be represented as
    //    "\uD834\uDD1E".
    //
    //  Two subsequent \u.... may have to be replaced with one actual codepoint.
    unsigned int surpair; // First half of open UTF-16 surrogate pair, or 0

    // Append `codepoint_` to the output as a 1- to 4-byte UTF-8 sequence.
    // Values above U+10FFFF are not written; push_back_u rejects them before
    // calling here.
    void append_codepoint(unsigned int codepoint_) {
        if (codepoint_ <= 0x7f) {
            str.push_back(static_cast<char>(codepoint_));
        } else if (codepoint_ <= 0x7FF) {
            str.push_back(static_cast<char>(0xC0 | (codepoint_ >> 6)));
            str.push_back(static_cast<char>(0x80 | (codepoint_ & 0x3F)));
        } else if (codepoint_ <= 0xFFFF) {
            str.push_back(static_cast<char>(0xE0 | (codepoint_ >> 12)));
            str.push_back(static_cast<char>(0x80 | ((codepoint_ >> 6) & 0x3F)));
            str.push_back(static_cast<char>(0x80 | (codepoint_ & 0x3F)));
        } else if (codepoint_ <= 0x10FFFF) {
            str.push_back(static_cast<char>(0xF0 | (codepoint_ >> 18)));
            str.push_back(static_cast<char>(0x80 | ((codepoint_ >> 12) & 0x3F)));
            str.push_back(static_cast<char>(0x80 | ((codepoint_ >> 6) & 0x3F)));
            str.push_back(static_cast<char>(0x80 | (codepoint_ & 0x3F)));
        }
    }
};


#endif // BITCOIN_UNIVALUE_INCLUDE_UNIVALUE_UTFFILTER_H

JSONUTF8StringFilter
Filter that generates and validates UTF-8, as well as collates UTF-16 surrogate pairs as specified in...
Definition: univalue_utffilter.h:13

JSONUTF8StringFilter::surpair
unsigned int surpair
Definition: univalue_utffilter.h:94

JSONUTF8StringFilter::is_valid
bool is_valid
Definition: univalue_utffilter.h:79

JSONUTF8StringFilter::codepoint
unsigned int codepoint
Definition: univalue_utffilter.h:81

JSONUTF8StringFilter::finalize
bool finalize()
Definition: univalue_utffilter.h:72

JSONUTF8StringFilter::state
int state
Definition: univalue_utffilter.h:82

JSONUTF8StringFilter::push_back_u
void push_back_u(unsigned int codepoint_)
Definition: univalue_utffilter.h:44

JSONUTF8StringFilter::append_codepoint
void append_codepoint(unsigned int codepoint_)
Definition: univalue_utffilter.h:96

JSONUTF8StringFilter::push_back
void push_back(uint8_t ch)
Definition: univalue_utffilter.h:18

JSONUTF8StringFilter::str
std::string & str
Definition: univalue_utffilter.h:78

JSONUTF8StringFilter::JSONUTF8StringFilter
JSONUTF8StringFilter(std::string &s)
Definition: univalue_utffilter.h:15