ddc
Functions | Variables
CommonLib/utf8xx.cpp File Reference
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdarg.h>
#include "utf8xx.h"
Include dependency graph for CommonLib/utf8xx.cpp:
This graph shows which files directly or indirectly include this file:

Functions

int u8_seqlen (const utf8str &s, size_t i)
 
size_t u8_toucs (ucs4str &dst, const utf8str &src)
 
ucs4str u8_toucs (const utf8str &src)
 
size_t u8_toutf8 (utf8str &dst, const ucs4str &src)
 
utf8str u8_toutf8 (const ucs4str &src)
 
size_t u8_wc_len (ucs4 ch)
 
size_t u8_ws_len (const ucs4str &src)
 
size_t u8_wc_toutf8 (utf8str &dst, ucs4 ch)
 
utf8str u8_wc_toutf8 (ucs4 ch)
 
size_t u8_offset (const utf8str &s, int charnum)
 
size_t u8_charnum (const utf8str &s, int offset)
 
ucs4 u8_nextchar (const utf8str &s, size_t *i)
 
ucs4 u8_nextcharn (const utf8str &s, size_t slen, size_t *i)
 
size_t u8_strlen (const utf8str &s)
 
void u8_inc (const utf8str &s, size_t *i)
 
void u8_dec (const utf8str &s, size_t *i)
 
int octal_digit (char c)
 
int hex_digit (char c)
 

Variables

static const uint32_t offsetsFromUTF8 [6]
 
static const char trailingBytesForUTF8 [256]
 

Function Documentation

◆ u8_seqlen()

int u8_seqlen ( const utf8str s,
size_t  i 
)

returns length of next utf-8 sequence

References trailingBytesForUTF8.

◆ u8_toucs() [1/2]

size_t u8_toucs ( ucs4str dst,
const utf8str src 
)

convert UTF-8 byte string src to UCS-4 wide character string dst, without error checking. Data is appended to dst.

Warning
only works for valid UTF-8, i.e. no 5- or 6-byte sequences
Parameters
dst = destination UCS-4 string
src = source UTF-8 byte string
Returns
number of characters converted

References offsetsFromUTF8, and trailingBytesForUTF8.

Referenced by hex_digit(), and u8_toucs().

Here is the caller graph for this function:

◆ u8_toucs() [2/2]

ucs4str u8_toucs ( const utf8str src)

convert UTF-8 byte string src to a new UCS-4 string

References u8_toucs().

Here is the call graph for this function:

◆ u8_toutf8() [1/2]

size_t u8_toutf8 ( utf8str dst,
const ucs4str src 
)

convert UCS-4 wide character string to UTF-8 byte string.

Parameters
dst = destination UTF-8 string
src = source UCS-4 string
Returns
number of characters converted

Referenced by u8_toutf8().

Here is the caller graph for this function:

◆ u8_toutf8() [2/2]

utf8str u8_toutf8 ( const ucs4str src)

convenience wrapper: UCS-4 string -> UTF-8 string

References u8_toutf8().

Here is the call graph for this function:

◆ u8_wc_len()

size_t u8_wc_len ( ucs4  ch)

(moo) get number of bytes required for representing a wide character ch in UTF-8. Returns 0 on error.

Referenced by u8_wc_toutf8(), and u8_ws_len().

Here is the caller graph for this function:

◆ u8_ws_len()

size_t u8_ws_len ( const ucs4str src)

(moo) get number of bytes required for representing a wide character string src in UTF-8

References u8_wc_len().

Here is the call graph for this function:

◆ u8_wc_toutf8() [1/2]

size_t u8_wc_toutf8 ( utf8str dst,
ucs4  ch 
)

append single UCS-4 character to a UTF-8 string

Parameters
dst = UTF-8 destination buffer
ch = UCS-4 character to convert
Returns
number of bytes written to dst (0 <= RETVAL <= UTF8XX_MAXBYTES)

Referenced by hex_digit(), u8_wc_toutf8(), unescapeCString(), unescapeJsonString(), and unescapeUtf8String().

Here is the caller graph for this function:

◆ u8_wc_toutf8() [2/2]

utf8str u8_wc_toutf8 ( ucs4  ch)

convenience wrapper: UCS-4 char -> UTF-8 string

References u8_wc_len(), and u8_wc_toutf8().

Here is the call graph for this function:

◆ u8_offset()

size_t u8_offset ( const utf8str s,
int  charnum 
)

(logical) character number to (physical) byte offset

References isutf.

◆ u8_charnum()

size_t u8_charnum ( const utf8str s,
int  offset 
)

(physical) byte offset to (logical) character number

References isutf.

◆ u8_nextchar()

ucs4 u8_nextchar ( const utf8str s,
size_t *  i 
)

read and return next logical character, updating an index variable

References isutf, and offsetsFromUTF8.

Referenced by hex_digit(), and u8_strlen().

Here is the caller graph for this function:

◆ u8_nextcharn()

ucs4 u8_nextcharn ( const utf8str s,
size_t  slen,
size_t *  i 
)

(moo): return next character, updating an index variable which may not exceed length slen

References isutf, and offsetsFromUTF8.

◆ u8_strlen()

size_t u8_strlen ( const utf8str s)

count the number of characters in a UTF-8 string

References u8_nextchar().

Here is the call graph for this function:

◆ u8_inc()

void u8_inc ( const utf8str s,
size_t *  i 
)

move to next character

References isutf.

◆ u8_dec()

void u8_dec ( const utf8str s,
size_t *  i 
)

move to previous character

References isutf.

◆ octal_digit()

int octal_digit ( char  c)

Referenced by hex_digit().

Here is the caller graph for this function:

◆ hex_digit()

int hex_digit ( char  c)

References isutf, octal_digit(), offsetsFromUTF8, u8_nextchar(), u8_toucs(), u8_wc_toutf8(), and u_int32_t.

Here is the call graph for this function:

Variable Documentation

◆ offsetsFromUTF8

const uint32_t offsetsFromUTF8[6]
static
Initial value:
= {
0x00000000UL, 0x00003080UL, 0x000E2080UL,
0x03C82080UL, 0xFA082080UL, 0x82082080UL
}

Referenced by hex_digit(), u8_nextchar(), u8_nextcharn(), and u8_toucs().

◆ trailingBytesForUTF8

const char trailingBytesForUTF8[256]
static
Initial value:
= {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
}

Referenced by u8_seqlen(), and u8_toucs().