Helper functions to work with Unicode. More...

#include <UnicodeUtils.h>

Classes
struct	UtfInserter
	[internal] Small helper class to append bytes to some arbitrary container. Used for Utf8 string building. More...

Static Public Member Functions
template<typename T >
static constexpr T *	GetMaxStringEnd ()
	[internal] Returns the max string end pointer for the given type

static bool	IsASCII (ezUInt32 uiChar)
	Returns whether a character is a pure ASCII character (only the first 7 Bits are used)

static bool	IsUtf8StartByte (char iByte)
	Checks whether the given byte is a start byte in a UTF-8 multi-byte sequence.

static bool	IsUtf8ContinuationByte (char iByte)
	Checks whether the given byte is a continuation byte (i.e. not the first byte) in a UTF-8 multi-byte sequence.

static ezUInt32	GetUtf8SequenceLength (char iFirstByte)
	Returns the number of bytes that a UTF-8 sequence is in length, which is encoded in the first byte of the sequence.

static ezUInt32	ConvertUtf8ToUtf32 (const char *pFirstChar)
	Converts the UTF-8 character that starts at pFirstChar into a UTF-32 character.

static ezUInt32	GetSizeForCharacterInUtf8 (ezUInt32 uiCharacter)
	Computes how many bytes the character would require, if encoded in UTF-8.

static ezResult	MoveToNextUtf8 (const char *&ref_szUtf8, ezUInt32 uiNumCharacters=1)
	Moves the given string pointer ahead to the next Utf8 character sequence.

static ezResult	MoveToNextUtf8 (const char *&ref_szUtf8, const char *szUtf8End, ezUInt32 uiNumCharacters=1)
	Moves the given string pointer ahead to the next Utf8 character sequence.

static ezResult	MoveToPriorUtf8 (const char *&ref_szUtf8, const char *szUtf8Start, ezUInt32 uiNumCharacters=1)
	Moves the given string pointer backwards to the previous Utf8 character sequence.

static bool	IsValidUtf8 (const char *szString, const char *szStringEnd=GetMaxStringEnd< char >())
	Returns false if the given string does not contain a completely valid Utf8 string.

static bool	SkipUtf8Bom (const char *&ref_szUtf8)
	If the given string starts with a Utf8 Bom, the pointer is incremented behind the Bom, and the function returns true.

static bool	SkipUtf16BomLE (const ezUInt16 *&ref_pUtf16)
	If the given string starts with a Utf16 little endian Bom, the pointer is incremented behind the Bom, and the function returns true.

static bool	SkipUtf16BomBE (const ezUInt16 *&ref_pUtf16)
	If the given string starts with a Utf16 big endian Bom, the pointer is incremented behind the Bom, and the function returns true.

template<typename ByteIterator >
static ezUInt32	DecodeUtf8ToUtf32 (ByteIterator &ref_szUtf8Iterator)
	Decodes the next character from the given Utf8 sequence to Utf32 and increments the iterator as far as necessary.

template<typename UInt16Iterator >
static bool	IsUtf16Surrogate (UInt16Iterator &ref_szUtf16Iterator)
	Characters that cannot be represented in a single UTF-16 code unit need to be split up into a surrogate pair (two code units) to form Unicode characters beyond the \uFFFF range.

template<typename UInt16Iterator >
static ezUInt32	DecodeUtf16ToUtf32 (UInt16Iterator &ref_szUtf16Iterator)
	Decodes the next character from the given Utf16 sequence to Utf32 and increments the iterator as far as necessary.

template<typename WCharIterator >
static ezUInt32	DecodeWCharToUtf32 (WCharIterator &ref_szWCharIterator)
	Decodes the next character from the given wchar_t sequence to Utf32 and increments the iterator as far as necessary.

template<typename ByteIterator >
static void	EncodeUtf32ToUtf8 (ezUInt32 uiUtf32, ByteIterator &ref_szUtf8Output)
	Encodes the given Utf32 character to Utf8 and writes as many bytes to the output iterator, as necessary.

template<typename UInt16Iterator >
static void	EncodeUtf32ToUtf16 (ezUInt32 uiUtf32, UInt16Iterator &ref_szUtf16Output)
	Encodes the given Utf32 character to Utf16 and writes as many bytes to the output iterator, as necessary.

template<typename WCharIterator >
static void	EncodeUtf32ToWChar (ezUInt32 uiUtf32, WCharIterator &ref_szWCharOutput)
	Encodes the given Utf32 character to wchar_t and writes as many bytes to the output iterator, as necessary.

template<typename Container >
static bool	RepairNonUtf8Text (const char *pStartData, const char *pEndData, Container &out_result)
	Checks whether an array of chars is a valid Utf8 string. If not, it repairs the string, i.e. by either re-encoding characters or removing them. Writes the result to the desired container type (ezString or ezStringBuilder).

Static Public Attributes
static constexpr ezUInt16	Utf16BomLE = 0xfeff
	Byte Order Mark for Little Endian Utf16 strings.

static constexpr ezUInt16	Utf16BomBE = 0xfffe
	Byte Order Mark for Big Endian Utf16 strings.

Detailed Description

Helper functions to work with Unicode.

Member Function Documentation

◆ MoveToNextUtf8() [1/2]

ezResult ezUnicodeUtils::MoveToNextUtf8	(	const char *&	ref_szUtf8,
		const char *	szUtf8End,
		ezUInt32	uiNumCharacters = `1`
	)

inline static

Moves the given string pointer ahead to the next Utf8 character sequence.

The string may point to an invalid position (in between a character sequence). It must not already point to a zero terminator.

◆ MoveToNextUtf8() [2/2]

ezResult ezUnicodeUtils::MoveToNextUtf8	(	const char *&	ref_szUtf8,
		ezUInt32	uiNumCharacters = `1`
	)

inline static

Moves the given string pointer ahead to the next Utf8 character sequence.

The string may point to an invalid position (in between a character sequence). It must not already point to a zero terminator.

◆ MoveToPriorUtf8()

ezResult ezUnicodeUtils::MoveToPriorUtf8	(	const char *&	ref_szUtf8,
		const char *	szUtf8Start,
		ezUInt32	uiNumCharacters = `1`
	)

inline static

Moves the given string pointer backwards to the previous Utf8 character sequence.

The string may point to an invalid position (in between a character sequence), or even the \0 terminator, as long as there is a valid string before it (and the user knows when to stop).

◆ RepairNonUtf8Text()

template<typename Container >

bool ezUnicodeUtils::RepairNonUtf8Text	(	const char *	pStartData,
		const char *	pEndData,
		Container &	out_result
	)

static

Checks whether an array of chars is a valid Utf8 string. If not, it repairs the string, i.e. by either re-encoding characters or removing them. Writes the result to the desired container type (ezString or ezStringBuilder).

Returns true if the text had to be repaired, false if it was already valid.

Note that, for include-order reasons, the implementation is in StringBuilder_inl.h, so you need to have StringBuilder.h included to use it.

◆ SkipUtf16BomBE()

bool ezUnicodeUtils::SkipUtf16BomBE ( const ezUInt16 *& ref_pUtf16 )

inline static

If the given string starts with a Utf16 big endian Bom, the pointer is incremented behind the Bom, and the function returns true.

Otherwise the pointer is unchanged and false is returned.

◆ SkipUtf16BomLE()

bool ezUnicodeUtils::SkipUtf16BomLE ( const ezUInt16 *& ref_pUtf16 )

inline static

If the given string starts with a Utf16 little endian Bom, the pointer is incremented behind the Bom, and the function returns true.

Otherwise the pointer is unchanged and false is returned.

◆ SkipUtf8Bom()

bool ezUnicodeUtils::SkipUtf8Bom ( const char *& ref_szUtf8 )

inline static

If the given string starts with a Utf8 Bom, the pointer is incremented behind the Bom, and the function returns true.

Otherwise the pointer is unchanged and false is returned.

The documentation for this class was generated from the following files:

/home/runner/work/ezEngine/ezEngine/Code/Engine/Foundation/Strings/UnicodeUtils.h
/home/runner/work/ezEngine/ezEngine/Code/Engine/Foundation/Strings/Implementation/StringBuilder_inl.h
/home/runner/work/ezEngine/ezEngine/Code/Engine/Foundation/Strings/Implementation/UnicodeUtils_inl.h

Classes

Static Public Member Functions

Static Public Attributes

Detailed Description

Member Function Documentation

◆ MoveToNextUtf8() [1/2]

◆ MoveToNextUtf8() [2/2]

◆ MoveToPriorUtf8()

◆ RepairNonUtf8Text()

◆ SkipUtf16BomBE()

◆ SkipUtf16BomLE()

◆ SkipUtf8Bom()