/* Auxiliary program to test mbrtowc(3) behaviour.
Copyright 2016-2022 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; If not, see . */
/* Test the operating-system's native mbrtowc(3) function,
by feeding it multibyte seqeunces one byte at a time,
and reporting the result.
The program prints the following values after each mbrtowc invocation,
separated by commas:
-2 the octet is contributes to a valid yet incomplete multibyte sequence
in the current locale.
-1 the octet causes an encoding error.
0 the octet represents a NUL byte
1 the octet is a valid single-byte character, OR
completes a valid multibyte sequence.
Because the program invokes mbrtowc(3) byte-by-byte, the reported
result should never be larger than 1.
Example of typical output with UTF-8 encoding
---------------------------------------------
The unicode character 'N-ARY SUMMATION' (U+2211), encoded in UTF-8 as:
hex: 0xE2 0x88 0x91
oct: 342 210 211
Decoding the valid sequence byte-by-byte gives:
$ printf '\342\210\221' | LC_ALL=en_US.UTF-8 test-mbrtowc
-2,-2,1
'\210' is not a valid leading byte in UTF-8,
thus the first byte gives -1, and the 'X' is treated
as a valid single-byte character:
$ printf '\210X' | LC_ALL=en_US.UTF-8 test-mbrtowc
-1,1
'\342' is a valid yet incomplete multibyte sequence.
Passing it to mbrtowc results in value '-2'.
The following value 'X' gives an encoding error '-1'
(as 'X' is not a valid trailing byte in a multibyte UTF-8 sequence):
$ printf '\342X' | LC_ALL=en_US.UTF-8 test-mbrtowc
-2,-1
Detecting implementation bugs in mbrtowc
----------------------------------------
UTF-8 implementation is correct on most operating systems.
Other multibyte locales might present more difficulties.
An example is the Japanese SHIFT-JIS locale under Mac OS X.
NOTE: The locale is 'ja_JP.SJIS' under Mac OS X, 'ja_JP.shiftjis'
under Ubuntu. 'ja_JP.sjis' was also found on some systems.
Using unicode character 'KATAKANA LETTER ZE' (U+30BC)
UTF-8: hex: 0xE3 0x82 0xBC
Shift-jis hex: 0x83 0x5B
oct: 203 133
The following is a valid multibyte sequence in SHIFT-JIS,
the first byte should result in '-2' (valid yet incomplete),
and the second byte should result in '1' (a valid multibyte sequence
completed):
$ printf '\203\133' | LC_ALL=ja_JP.SJIS test-mbrtowc
-2,1
The follwing is an INVALID multibyte sequence in SHIFT-JIS
(The byte ':' is not valid as a second octet).
Buggy implementations will accept this as a valid multibyte sequence:
# NOTE: this result indicates a buggy mbrtowc
$ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc
-2,1
A correct implementations should report '-1' for the second byte (i.e.
an encoding error):
$ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc
-2,-1
Expected results with correct implementations
---------------------------------------------
In GNU Sed some tests purposely use invalid multibyte sequences
to test sed's behaviour. A buggy implemetation of mbrtowc
would result in false-alarm failures.
The following are expected results in correct implementations:
(locale names are from Mac OS X):
$ printf '\203\133' | LC_ALL=ja_JP.SJIS test-mbrtowc
-2,1
$ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc
-2,-1
$ printf '\262C' | LC_ALL=ja_JP.eucJP test-mbrtowc
-2,-1
*/
#include
#include
#include
#include
#include
#include "closeout.h"
#include "error.h"
#include "progname.h"
/* stub replacement for non-standard err(3) */
static int
die (const char *msg)
{
error (0, 0, "%s: error: %s\n", program_name, msg);
exit (EXIT_FAILURE);
}
int
main (int argc, char **argv)
{
int c;
int first = 1;
set_program_name (argv[0]);
if (!setlocale (LC_ALL, ""))
die ("failed to set locale");
while ((c = getchar ()) != EOF)
{
wchar_t wc;
char ch = (unsigned char) c;
int i = (int) mbrtowc (&wc, &ch, 1, NULL);
if (!first)
putchar (',');
first = 0;
printf ("%d", i);
}
if (first)
die ("empty input");
putchar ('\n');
if (ferror (stdin))
die ("read error");
close_stdout ();
exit (EXIT_SUCCESS);
}