/* Multi-byte characters, Error encodings, and Lengths (MCELs) Copyright 2023-2024 Free Software Foundation, Inc. This file is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This file is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this program. If not, see . */ /* Written by Paul Eggert. */ /* The macros in this file implement multi-byte character representation and forward iteration through a multi-byte string. They are simpler and can be faster than the mbiter family. However, they do not support obsolescent encodings like CP864, EBCDIC, Johab, and Shift JIS that glibc also does not support, and it is up to the caller to coalesce encoding-error bytes if desired. The mcel_scan function lets code iterate through an array of bytes, supporting character encodings in practical use more simply than using plain mbrtoc32. Instead of this single-byte code: char *p = ..., *lim = ...; for (; p < lim; p++) process (*p); You can use this multi-byte code: char *p = ..., *lim = ...; for (mcel_t g; p < lim; p += g.len) { g = mcel_scan (p, lim); process (g); } You can select from G using G.ch, G.err, and G.len. G is an encoding error if G.err is nonzero, a character otherwise. The mcel_scanz function is similar except it works with a string of unknown but positive length that is terminated with '\0'. Instead of this single-byte code: char *p = ...; for (; *p; p++) process (*p); You can use this multi-byte code: char *p = ...; for (mcel_t g; *p; p += g.len) { g = mcel_scanz (p); process (g); } mcel_scant (P, TERMINATOR) is like mcel_scanz (P) except the string is terminated by TERMINATOR. The C standard says that the TERMINATORs '\0', '\r', '\n', '.', '/' are safe, as they cannot be a part (even a trailing byte) of a multi-byte character. In practice TERMINATOR is safe if 0 <= TERMINATOR <= 0x2f (ASCII '/'). mcel_ch (CH, LEN) and mcel_err (ERR) construct mcel_t values. mcel_cmp (G1, G2) compares two mcel_t values lexicographically by character or by encoding byte value, with encoding bytes sorting after characters. Calls like c32isalpha (G.ch) test G; they return false for encoding errors since calls like c32isalpha (0) return false. Calls like mcel_tocmp (c32tolower, G1, G2) are like mcel_cmp (G1, G2), but transliterate first. Although ISO C and POSIX allow encodings that have shift states or that can produce multiple characters from an indivisible byte sequence, POSIX does not require support for these encodings, they are not in practical use on GNUish platforms, and omitting support for them simplifies the API. */ #ifndef _MCEL_H #define _MCEL_H 1 #if !_GL_CONFIG_H_INCLUDED #error "Please include config.h first." #endif #include #include #include #include /* Pacify GCC re type limits. */ #if defined __GNUC__ && 4 < __GNUC__ + (3 <= __GNUC_MINOR__) # pragma GCC diagnostic ignored "-Wtype-limits" #endif /* The maximum multi-byte character length supported on any platform. This can be less than MB_LEN_MAX because many platforms have a large MB_LEN_MAX to allow for stateful encodings, and mcel does not support these encodings. MCEL_LEN_MAX is enough for UTF-8, EUC, Shift-JIS, GB18030, etc. In all multi-byte encodings supported by glibc, 0 < MB_CUR_MAX <= MCEL_LEN_MAX <= MB_LEN_MAX. */ enum { MCEL_LEN_MAX = MB_LEN_MAX < 4 ? MB_LEN_MAX : 4 }; /* Bounds for mcel_t members. */ enum { MCEL_CHAR_MAX = 0x10FFFF }; enum { MCEL_ERR_MIN = 0x80 }; /* mcel_t is a type representing a character CH or an encoding error byte ERR, along with a count of the LEN bytes that represent CH or ERR. If ERR is zero, CH is a valid character and 0 < LEN <= MCEL_LEN_MAX; otherwise ERR is an encoding error byte, MCEL_ERR_MIN <= ERR, CH == 0, and LEN == 1. */ typedef struct { char32_t ch; unsigned char err; unsigned char len; } mcel_t; /* Every multi-byte character length fits in mcel_t's LEN. */ static_assert (MB_LEN_MAX <= UCHAR_MAX); /* Shifting an encoding error byte left by this value suffices to sort encoding errors after characters. */ enum { MCEL_ERR_SHIFT = 14 }; static_assert (MCEL_CHAR_MAX < MCEL_ERR_MIN << MCEL_ERR_SHIFT); /* Unsigned char promotes to int. */ static_assert (UCHAR_MAX <= INT_MAX); /* Bytes have 8 bits, as POSIX requires. */ static_assert (CHAR_BIT == 8); #ifndef _GL_LIKELY /* Rely on __builtin_expect, as provided by the module 'builtin-expect'. */ # define _GL_LIKELY(cond) __builtin_expect ((cond), 1) # define _GL_UNLIKELY(cond) __builtin_expect ((cond), 0) #endif _GL_INLINE_HEADER_BEGIN #ifndef MCEL_INLINE # define MCEL_INLINE _GL_INLINE #endif /* mcel_t constructors. */ MCEL_INLINE mcel_t mcel_ch (char32_t ch, size_t len) { assume (0 < len); assume (len <= MCEL_LEN_MAX); assume (ch <= MCEL_CHAR_MAX); return (mcel_t) {.ch = ch, .len = len}; } MCEL_INLINE mcel_t mcel_err (unsigned char err) { assume (MCEL_ERR_MIN <= err); return (mcel_t) {.err = err, .len = 1}; } /* Compare C1 and C2, with encoding errors sorting after characters. Return <0, 0, >0 for <, =, >. */ MCEL_INLINE int mcel_cmp (mcel_t c1, mcel_t c2) { int ch1 = c1.ch, ch2 = c2.ch; return ((c1.err - c2.err) * (1 << MCEL_ERR_SHIFT)) + (ch1 - ch2); } /* Apply the uchar translator TO to C1 and C2 and compare the results, with encoding errors sorting after characters, Return <0, 0, >0 for <, =, >. */ MCEL_INLINE int mcel_tocmp (wint_t (*to) (wint_t), mcel_t c1, mcel_t c2) { int cmp = mcel_cmp (c1, c2); if (_GL_LIKELY ((c1.err - c2.err) | !cmp)) return cmp; int ch1 = to (c1.ch), ch2 = to (c2.ch); return ch1 - ch2; } /* Whether C represents itself as a Unicode character when it is the first byte of a single- or multi-byte character. These days it is safe to assume ASCII, so do not support obsolescent encodings like CP864, EBCDIC, Johab, and Shift JIS. */ MCEL_INLINE bool mcel_isbasic (char c) { return _GL_LIKELY (0 <= c && c < MCEL_ERR_MIN); } /* With mcel there should be no need for the performance overhead of replacing glibc mbrtoc32, as callers shouldn't care whether the C locale treats a byte with the high bit set as an encoding error. */ #ifdef __GLIBC__ # undef mbrtoc32 #endif /* Scan bytes from P inclusive to LIM exclusive. P must be less than LIM. Return the character or encoding error starting at P. */ MCEL_INLINE mcel_t mcel_scan (char const *p, char const *lim) { /* Handle ASCII quickly to avoid the overhead of calling mbrtoc32. In supported encodings, the first byte of a multi-byte character cannot be an ASCII byte. */ char c = *p; if (mcel_isbasic (c)) return mcel_ch (c, 1); /* An initial mbstate_t; initialization optimized for some platforms. For details about these and other platforms, see wchar.in.h. */ #if defined __GLIBC__ && 2 < __GLIBC__ + (2 <= __GLIBC_MINOR__) /* Although only a trivial optimization, it's worth it for GNU. */ mbstate_t mbs; mbs.__count = 0; #elif (defined __FreeBSD__ || defined __DragonFly__ || defined __OpenBSD__ \ || (defined __APPLE__ && defined __MACH__)) /* These platforms have 128-byte mbstate_t. What were they thinking? Initialize just for supported encodings (UTF-8, EUC, etc.). Avoid memset because some compilers generate function call code. */ struct mbhidden { char32_t ch; int utf8_want, euc_want; } _GL_ATTRIBUTE_MAY_ALIAS; union { mbstate_t m; struct mbhidden s; } u; u.s.ch = u.s.utf8_want = u.s.euc_want = 0; # define mbs u.m #elif defined __NetBSD__ /* Experiments on both 32- and 64-bit NetBSD platforms have shown that it doesn't work to clear fewer than 24 bytes. */ struct mbhidden { long long int a, b, c; } _GL_ATTRIBUTE_MAY_ALIAS; union { mbstate_t m; struct mbhidden s; } u; u.s.a = u.s.b = u.s.c = 0; # define mbs u.m #else /* mbstate_t has unknown structure or is not worth optimizing. */ mbstate_t mbs = {0}; #endif char32_t ch; size_t len = mbrtoc32 (&ch, p, lim - p, &mbs); #undef mbs /* Any LEN with top bit set is an encoding error, as LEN == (size_t) -3 is not supported and MB_LEN_MAX is small. */ if (_GL_UNLIKELY ((size_t) -1 / 2 < len)) return mcel_err (c); /* A multi-byte character. LEN must be positive, as *P != '\0' and shift sequences are not supported. */ return mcel_ch (ch, len); } /* Scan bytes from P, a byte sequence terminated by TERMINATOR. If *P == TERMINATOR, scan just that byte; otherwise scan bytes up to but not including TERMINATOR. TERMINATOR must be ASCII, and should be '\0', '\r', '\n', '.', or '/'. Return the character or encoding error starting at P. */ MCEL_INLINE mcel_t mcel_scant (char const *p, char terminator) { /* Handle ASCII quickly for speed. */ if (mcel_isbasic (*p)) return mcel_ch (*p, 1); /* Defer to mcel_scan for non-ASCII. Compute length with code that is typically faster than strnlen. */ char const *lim = p + 1; for (int i = 0; i < MCEL_LEN_MAX - 1; i++) lim += *lim != terminator; return mcel_scan (p, lim); } /* Scan bytes from P, a byte sequence terminated by '\0'. If *P == '\0', scan just that byte; otherwise scan bytes up to but not including '\0'. Return the character or encoding error starting at P. */ MCEL_INLINE mcel_t mcel_scanz (char const *p) { return mcel_scant (p, '\0'); } _GL_INLINE_HEADER_END #endif /* _MCEL_H */