#!/bin/sh
# Test multibyte locale which is not UTF-8 (ja_JP.shift_jis)
# This is a stateful locale. Same byte value can be either
# a single-byte character, or the second byte of a multibyte
# character.
# Copyright (C) 2016-2022 Free Software Foundation, Inc.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see .
. "${srcdir=.}/testsuite/init.sh"; path_prepend_ ./sed
print_ver_ sed
# If found, LOCALE_JA_SJIS will contain the locale name.
require_ja_shiftjis_locale_
# Ensure the implementation is not buggy (skip otherwise)
require_valid_ja_shiftjis_locale_ "$LOCALE_JA_SJIS"
# This test uses two characters:
# Unicode Character 'KATAKANA LETTER ZE' (U+30BC)
# Unicode Character 'KATAKANA LETTER ZO' (U+30BE)
#
# In SHIFT-JIS locale, these multibyte characters contain
# open/close brackets (ASCII 0x5B/0x5D) as the trailing byte.
#
# See also:
# https://en.wikipedia.org/wiki/Shift_JIS
# http://www.rikai.com/library/kanjitables/kanji_codes.sjis.shtml
# Unicode Character 'KATAKANA LETTER ZE' (U+30BC)
#
# UTF-8: hex: 0xE3 0x82 0xBC
# bin: 11100011 10000010 10111100
#
# Shift-jis hex: 0x83 0x5B
# oct: 203 133
# bin: 10000011 01011011
#
# Conversion example:
# $ printf '\x83\x5B' | iconv -f SHIFT-JIS -t UTF-8 | od -tx1o1c
# 0000000 e3 82 bc
# 343 202 274
# 343 202 274
# Unicode Character 'KATAKANA LETTER ZO' (U+30BE)
#
# UTF-8: hex: 0xE3 0x82 0xBE
# bin: 11100011 10000010 10111110
#
# Shift-jis hex: 0x83 0x5D
# oct: 203 135
# bin: 10000011 01011101
#
# Conversion example:
# $ printf '\x83\x5D' | iconv -f SHIFT-JIS -t UTF-8 | od -tx1o1c
# 0000000 e3 82 be
# 343 202 276
# 343 202 276
#
#
# Tests 1,2: Test y/// command with multibyte, non-utf8 seqeunce.
# Implmenetation notes: str_append() has special code path for non-utf8 cases.
#
# Test 1: valid multibyte seqeunce
printf 'y/a/\203\133/' > p1 || framework_failure_
echo Xa > in1 || framework_failure_
printf 'X\203\133\n' > exp1 || framework_failure_
LC_ALL="$LOCALE_JA_SJIS" sed -f p1 out1 || fail=1
compare_ exp1 out1 || fail=1
# Test 2: invalid multibyte seqeunce, treated as two single-byte characters.
printf 'y/aa/\203\060/' > p2 || framework_failure_
LC_ALL="$LOCALE_JA_SJIS" sed -f p2 out2 || fail=1
compare_ /dev/null out2 || fail=1
#
# Test 3: multibyte character class with these characters.
#
# Before sed-4.3, snarf_char_class would parse it incorrectly,
# Treating the first closing-bracket as closing the character-class,
# instead of being part of a multibyte sequence.
printf '/[\203]/]/p' > p3 || framework_failure_
LC_ALL="$LOCALE_JA_SJIS" sed -f p3 out3 || fail=1
compare_ /dev/null out3 || fail=1
# Test 4:
# Same as test 3, but with the other multibyte character.
# (this did not cause a failure before sed-4.3, but the code was incorrect).
# Keep this test for code-coverage purposes.
printf '/[\203[/]/p' > p4 || framework_failure_
LC_ALL="$LOCALE_JA_SJIS" sed -f p4 out4 || fail=1
compare_ /dev/null out4 || fail=1
# TODO: Find a locale in which ':.=' can be part of a valid multibyte octet.
#
# snarf_char_class specifically tests for five bytes: ':.=[]' .
# '[' and ']' are tested above, yet '.:=' are not valid as part of a
# multibyte shift-jis sequence.
#
# valid:
# $ printf '\203]' | iconv -f SHIFT-JIS -t utf-8
# $ printf '\203[' | iconv -f SHIFT-JIS -t utf-8
#
# invalid:
# $ printf '\203:' | iconv -f SHIFT-JIS -t utf-8
# iconv: (stdin):1:0: cannot convert
#
# $ printf '\203=' | iconv -f SHIFT-JIS -t utf-8
# iconv: (stdin):1:0: cannot convert
#
# $ printf '\203.' | iconv -f SHIFT-JIS -t utf-8
# iconv: (stdin):0:0: cannot convert
Exit $fail