From f8b9b7439456c5ab8d95ad3c27f905ebd501685f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Sun, 9 Feb 2020 13:04:34 +0100 Subject: [PATCH 1/2] libblkid: Fix UTF-16 support in function blkid_encode_to_utf8() Function blkid_encode_to_utf8() says that it supports the BLKID_ENC_UTF16LE and BLKID_ENC_UTF16BE encodings, but that is not true: it supports only UCS-2 (and not full UTF-16). As all places where BLKID_ENC_UTF16LE and BLKID_ENC_UTF16BE are used expect UTF-16 and not UCS-2, this patch changes the implementation of encodings BLKID_ENC_UTF16LE and BLKID_ENC_UTF16BE to support full UTF-16, including surrogate pairs, and not only UCS-2. --- libblkid/src/encode.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/libblkid/src/encode.c b/libblkid/src/encode.c index 33d349127..36ad1c956 100644 --- a/libblkid/src/encode.c +++ b/libblkid/src/encode.c @@ -237,7 +237,8 @@ size_t blkid_encode_to_utf8(int enc, unsigned char *dest, size_t len, const unsigned char *src, size_t count) { size_t i, j; - uint16_t c; + uint32_t c; + uint16_t c2; for (j = i = 0; i < count; i++) { if (enc == BLKID_ENC_UTF16LE) { @@ -255,6 +256,17 @@ size_t blkid_encode_to_utf8(int enc, unsigned char *dest, size_t len, } else { return 0; } + if ((enc == BLKID_ENC_UTF16LE || enc == BLKID_ENC_UTF16BE) && + c >= 0xD800 && c <= 0xDBFF && i+2 < count) { + if (enc == BLKID_ENC_UTF16LE) + c2 = (src[i+2] << 8) | src[i+1]; + else + c2 = (src[i+1] << 8) | src[i+2]; + if (c2 >= 0xDC00 && c2 <= 0xDFFF) { + c = 0x10000 + ((c - 0xD800) << 10) + (c2 - 0xDC00); + i += 2; + } + } if (c == 0) { dest[j] = '\0'; break; @@ -267,12 +279,19 @@ size_t blkid_encode_to_utf8(int enc, unsigned char *dest, size_t len, break; dest[j++] = (uint8_t) (0xc0 | (c >> 6)); dest[j++] = (uint8_t) (0x80 | (c & 0x3f)); - } else { + } else if (c < 0x10000) { if (j+3 >= len) break; dest[j++] = (uint8_t) (0xe0 | (c >> 12)); dest[j++] = (uint8_t) (0x80 | ((c >> 6) & 0x3f)); dest[j++] = 
(uint8_t) (0x80 | (c & 0x3f)); + } else { + if (j+4 >= len) + break; + dest[j++] = (uint8_t) (0xf0 | (c >> 18)); + dest[j++] = (uint8_t) (0x80 | ((c >> 12) & 0x3f)); + dest[j++] = (uint8_t) (0x80 | ((c >> 6) & 0x3f)); + dest[j++] = (uint8_t) (0x80 | (c & 0x3f)); } } dest[j] = '\0'; From ef111c0b3e7fbe6394e92485bfeddc66a2137345 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Sun, 9 Feb 2020 13:04:39 +0100 Subject: [PATCH 2/2] tests: Add UDF hdd image with emoji label created by mkudffs 2.2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mkudffs 2.2 can create a UDF filesystem whose label contains Unicode code points above U+FFFF, which cannot be represented in UCS-2, so UTF-16 (with surrogate pairs) must be used. Code points above U+FFFF are encoded in UTF-8 as four bytes; one example is the emoji "😀". This is a test case to check that libblkid correctly supports UTF-16 decoding. It was created with mkudffs 2.2: $ dd if=/dev/zero of=udf-hdd-mkudffs-2.2.img bs=1M count=10 $ mkudffs -l 😀 udf-hdd-mkudffs-2.2.img --- .../expected/blkid/low-probe-udf-hdd-mkudffs-2.2 | 11 +++++++++++ .../blkid/images-fs/udf-hdd-mkudffs-2.2.img.xz | Bin 0 -> 2396 bytes 2 files changed, 11 insertions(+) create mode 100644 tests/expected/blkid/low-probe-udf-hdd-mkudffs-2.2 create mode 100644 tests/ts/blkid/images-fs/udf-hdd-mkudffs-2.2.img.xz diff --git a/tests/expected/blkid/low-probe-udf-hdd-mkudffs-2.2 b/tests/expected/blkid/low-probe-udf-hdd-mkudffs-2.2 new file mode 100644 index 000000000..a1d9ee16a --- /dev/null +++ b/tests/expected/blkid/low-probe-udf-hdd-mkudffs-2.2 @@ -0,0 +1,11 @@ +ID_FS_BLOCK_SIZE=512 +ID_FS_LABEL=😀 +ID_FS_LABEL_ENC=😀 +ID_FS_LOGICAL_VOLUME_ID=😀 +ID_FS_TYPE=udf +ID_FS_USAGE=filesystem +ID_FS_UUID=5e3d6e3fee58c271 +ID_FS_UUID_ENC=5e3d6e3fee58c271 +ID_FS_VERSION=2.01 +ID_FS_VOLUME_ID=😀 +ID_FS_VOLUME_SET_ID=5e3d6e3fee58c271LinuxUDF diff --git 
a/tests/ts/blkid/images-fs/udf-hdd-mkudffs-2.2.img.xz b/tests/ts/blkid/images-fs/udf-hdd-mkudffs-2.2.img.xz new file mode 100644 index 0000000000000000000000000000000000000000..bc00729b10a52dfc0df6b9b6df55c6eabb7eefec GIT binary patch literal 2396 zcmexsUKJ6=z`*kC+7>q^21Q001_lO!^ATtM{}*CmiDh8O|NH;{;_d(4?L0(_EE5BF zOwxF^xQ+eiyC-r@m-UZY*)V!bk`t-AsnAWAJud>2QQ>K1c^Ldi{ z;Ymu45A;NGYwxc-760pvlH=`vQTp=^Eq~v$H6+G5r*P+kAE$EH7aA$GonLtRO6}&r z-cQF@3!d|+tTgBJuzIHW<*wT;@ql(wqucUpty6c)F)+?JaOfm2GWYk6<%?fO|GfP4wTmd#q1rq33s&*bR9SHalg!)~g;2Jdw^mk1J*6t0lgs58ibPZ`ALQGjFca zop8uD>qM@i?c2A!E9T}*oPIspcV_v5#inVNTl*&ki}YRkpZW4IkLT@Ed#*`F%LQjL z@TxD^__cD+Gp@{^jVoKD%D=CY@p6hgdhD+F+0}RNob)#9PdMCV&i30QPj<1%g7C9E zKHUGp57-B>i`n>`k9e&zeWmVhtE1wA>Mgu`AD-`?{avJ%=|A2D$=qPpv0E zxvsanlJVW_z|Lz0M?H-iPo29c{Xg|x;}U~{oR+sdkukA<{=fdF+;cZ4&*aYZSzE4E zdH+jH4&Qt7;;s_YV8>;9EobsP*s_INCnqC;)iLpDo1AS?U-GTm_~R4&I96Nn75_^2 zzMx%NYv8`7_NeIp)6#}Bjcm2WIGIJ7ecr2>O;6wX)z8BGYSXGSOZ}ezGm-gtd!j@d z`_}9@i3!RH!u<`w;!{rj_TqTA=8|QNk5|$AAK%~avkBi{$Gr5_Nre^bTPAQ^NiS7C zzo2Bwiq3=0A9Hy76D~DpMlSJ@cmMxyEot&RzEK#J@_#tt*WC zlUiz{0+S|{c*Y#b-8AX>f)nT1S>DaRx76u$hkXn~{{_GGY8q1t_9u~}7t{neH z<8&|m=qTs5v z39`G+ZB&VMXK*?z8`=YPj*Q{8Mx9_bdawumkyx%p;gbsXKBkOE=)wgqlvaF|N;L3+@G8P}*_`mx7%j{`>p$(hd zeIA~8Rbh0yL^bHKXy9eJ=a25{J>FKYv#U%+-16u4^yZQ??Q0a%f~R@?nbyGkn1P?c zrFFLHO$P<`l+Uk4G6R9FBnAfc-yeF`8OJ9H Cv8N6I literal 0 HcmV?d00001