/usr/share/julia/test/unicode/checkstring.jl is in julia-common 0.4.7-6.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# This file is a part of Julia. License is MIT: http://julialang.org/license
# 11575
# Test invalid sequences
# Every byte sequence below must be rejected with a UnicodeError.
# `byt` holds the byte currently under test so that the catch clause can
# report it; it needs to be defined outside the try block!
byt = 0x0
try
    # Continuation byte not after lead
    for byt in 0x80:0xbf
        @test_throws UnicodeError Base.checkstring(UInt8[byt])
    end

    # Test lead bytes
    for byt in 0xc0:0xff
        # Single lead byte at end of string
        @test_throws UnicodeError Base.checkstring(UInt8[byt])
        # Lead followed by non-continuation character < 0x80
        @test_throws UnicodeError Base.checkstring(UInt8[byt,0])
        # Lead followed by non-continuation character > 0xbf
        @test_throws UnicodeError Base.checkstring(UInt8[byt,0xc0])
    end

    # Test overlong 2-byte
    # (0xc0 0x80 is skipped on purpose: the valid-sequence tests accept it
    # as the long encoded Nul byte of Modified UTF-8)
    for byt in 0x81:0xbf
        @test_throws UnicodeError Base.checkstring(UInt8[0xc0,byt])
    end
    for byt in 0x80:0xbf
        @test_throws UnicodeError Base.checkstring(UInt8[0xc1,byt])
    end

    # Test overlong 3-byte
    for byt in 0x80:0x9f
        @test_throws UnicodeError Base.checkstring(UInt8[0xe0,byt,0x80])
    end

    # Test overlong 4-byte
    # The lead byte has to be 0xf0: 0xef starts a 3-byte sequence, so using it
    # here would only exercise the "stray continuation byte" error instead of
    # the overlong 4-byte check.
    for byt in 0x80:0x8f
        @test_throws UnicodeError Base.checkstring(UInt8[0xf0,byt,0x80,0x80])
    end

    # Test 4-byte > 0x10ffff
    for byt in 0x90:0xbf
        @test_throws UnicodeError Base.checkstring(UInt8[0xf4,byt,0x80,0x80])
    end
    for byt in 0xf5:0xf7
        @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80,0x80])
    end

    # Test 5-byte
    for byt in 0xf8:0xfb
        @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80,0x80,0x80])
    end

    # Test 6-byte
    for byt in 0xfc:0xfd
        @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80,0x80,0x80,0x80])
    end

    # Test 7-byte
    @test_throws UnicodeError Base.checkstring(UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80])

    # Three byte sequences
    for byt in 0xe0:0xef
        # Lead followed by only 1 continuation byte
        @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80])
        # Lead ended by non-continuation character < 0x80
        @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0])
        # Lead ended by non-continuation character > 0xbf
        @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0xc0])
    end

    # 3-byte encoded surrogate character(s)
    # Single surrogate
    @test_throws UnicodeError Base.checkstring(UInt8[0xed,0xa0,0x80])
    # Not followed by surrogate
    @test_throws UnicodeError Base.checkstring(UInt8[0xed,0xa0,0x80,0xed,0x80,0x80])
    # Trailing surrogate first
    @test_throws UnicodeError Base.checkstring(UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80])
    # Followed by lead surrogate
    @test_throws UnicodeError Base.checkstring(UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80])

    # Four byte sequences
    for byt in 0xf0:0xf4
        # Lead followed by only 2 continuation bytes
        @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80])
        # Lead ended by non-continuation character < 0x80
        @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80,0])
        # Lead ended by non-continuation character > 0xbf
        @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80,0xc0])
    end

    # Long (4-byte) encoding of the Nul byte
    @test_throws UnicodeError utf8(b"\xf0\x80\x80\x80")
    # Test ends of long encoded surrogates
    @test_throws UnicodeError utf8(b"\xf0\x8d\xa0\x80")
    @test_throws UnicodeError utf8(b"\xf0\x8d\xbf\xbf")
    @test_throws UnicodeError Base.checkstring(b"\xf0\x80\x80\x80")
    # Long encodings (2-byte 0x01, 4-byte Nul) pass once accept_long_char is set
    @test Base.checkstring(b"\xc0\x81"; accept_long_char=true) == (1,0x1,0,0,0)
    @test Base.checkstring(b"\xf0\x80\x80\x80"; accept_long_char=true) == (1,0x1,0,0,0)
catch err
    # Report which byte was being tested, then propagate the failure.
    println("Error testing checkstring: $byt, $err")
    # rethrow() keeps the backtrace of the original failure, unlike throw(err)
    rethrow()
end
# Code-unit sequences that must be rejected: surrogates that are unpaired or
# in the wrong order (in both 16-bit and 32-bit input), and a 32-bit value
# beyond the Unicode range
for badunits in (UInt16[0xd800],         # Surrogates: lone 0xd800
                 UInt16[0xdc00],         # Surrogates: lone 0xdc00
                 UInt16[0xdc00,0xd800],  # Surrogates: 0xdc00 before 0xd800
                 UInt32[0xd800],         # Surrogates in UTF-32: lone 0xd800
                 UInt32[0xdc00],         # Surrogates in UTF-32: lone 0xdc00
                 UInt32[0xdc00,0xd800],  # Surrogates in UTF-32: 0xdc00 before 0xd800
                 UInt32[0x110000])       # Characters > 0x10ffff
    @test_throws UnicodeError Base.checkstring(badunits)
end
# Test starting at a different position: checking from index 2 skips the
# invalid first element
@test Base.checkstring(UInt32[0x110000, 0x1f596], 2) == (1,0x10,1,0,0)
# Test valid sequences
# Each entry pairs a code-unit vector with the tuple checkstring must return.
# NOTE(review): judging from the expected values, the tuple looks like
# (characters, flags, 4-byte count, 3-byte count, 2-byte count) -- confirm
# against the checkstring implementation.

# 8-bit code units (UTF-8)
for (units, expected) in (
        (UInt8[0x0],                 (1,0,0,0,0)),   # Nul byte, beginning of ASCII range
        (UInt8[0x7f],                (1,0,0,0,0)),   # End of ASCII range
        (UInt8[0xc0,0x80],           (1,1,0,0,0)),   # Long encoded Nul byte (Modified UTF-8, Java)
        (UInt8[0xc2,0x80],           (1,2,0,0,1)),   # \u80, beginning of Latin1 range
        (UInt8[0xc3,0xbf],           (1,2,0,0,1)),   # \uff, end of Latin1 range
        (UInt8[0xc4,0x80],           (1,4,0,0,1)),   # \u100, beginning of non-Latin1 2-byte range
        (UInt8[0xdf,0xbf],           (1,4,0,0,1)),   # \u7ff, end of non-Latin1 2-byte range
        (UInt8[0xe0,0xa0,0x80],      (1,8,0,1,0)),   # \u800, beginning of 3-byte range
        (UInt8[0xed,0x9f,0xbf],      (1,8,0,1,0)),   # \ud7ff, end of first part of 3-byte range
        (UInt8[0xee,0x80,0x80],      (1,8,0,1,0)),   # \ue000, beginning of second part of 3-byte range
        (UInt8[0xef,0xbf,0xbf],      (1,8,0,1,0)),   # \uffff, end of 3-byte range
        (UInt8[0xf0,0x90,0x80,0x80], (1,16,1,0,0)),  # \U10000, beginning of 4-byte range
        (UInt8[0xf4,0x8f,0xbf,0xbf], (1,16,1,0,0)),  # \U10ffff, end of 4-byte range
        (UInt8[0xed,0xa0,0x80,0xed,0xb0,0x80], (1,0x30,1,0,0)),  # Overlong \U10000, (CESU-8)
        (UInt8[0xed,0xaf,0xbf,0xed,0xbf,0xbf], (1,0x30,1,0,0)))  # Overlong \U10ffff, (CESU-8)
    @test Base.checkstring(units) == expected
end

# 16-bit code units (UTF-16)
for (units, expected) in (
        (UInt16[0x0000],        (1,0,0,0,0)),   # Nul byte, beginning of ASCII range
        (UInt16[0x007f],        (1,0,0,0,0)),   # End of ASCII range
        (UInt16[0x0080],        (1,2,0,0,1)),   # Beginning of Latin1 range
        (UInt16[0x00ff],        (1,2,0,0,1)),   # End of Latin1 range
        (UInt16[0x0100],        (1,4,0,0,1)),   # Beginning of non-Latin1 2-byte range
        (UInt16[0x07ff],        (1,4,0,0,1)),   # End of non-Latin1 2-byte range
        (UInt16[0x0800],        (1,8,0,1,0)),   # Beginning of 3-byte range
        (UInt16[0xd7ff],        (1,8,0,1,0)),   # End of first part of 3-byte range
        (UInt16[0xe000],        (1,8,0,1,0)),   # Beginning of second part of 3-byte range
        (UInt16[0xffff],        (1,8,0,1,0)),   # End of 3-byte range
        (UInt16[0xd800,0xdc00], (1,16,1,0,0)),  # \U10000, beginning of 4-byte range
        (UInt16[0xdbff,0xdfff], (1,16,1,0,0)))  # \U10ffff, end of 4-byte range
    @test Base.checkstring(units) == expected
end

# 32-bit code units (UTF-32)
for (units, expected) in (
        (UInt32[0x0000],        (1,0,0,0,0)),   # Nul byte, beginning of ASCII range
        (UInt32[0x007f],        (1,0,0,0,0)),   # End of ASCII range
        (UInt32[0x0080],        (1,2,0,0,1)),   # Beginning of Latin1 range
        (UInt32[0x00ff],        (1,2,0,0,1)),   # End of Latin1 range
        (UInt32[0x0100],        (1,4,0,0,1)),   # Beginning of non-Latin1 2-byte range
        (UInt32[0x07ff],        (1,4,0,0,1)),   # End of non-Latin1 2-byte range
        (UInt32[0x0800],        (1,8,0,1,0)),   # Beginning of 3-byte range
        (UInt32[0xd7ff],        (1,8,0,1,0)),   # End of first part of 3-byte range
        (UInt32[0xe000],        (1,8,0,1,0)),   # Beginning of second part of 3-byte range
        (UInt32[0xffff],        (1,8,0,1,0)),   # End of 3-byte range
        (UInt32[0x10000],       (1,16,1,0,0)),  # \U10000, beginning of 4-byte range
        (UInt32[0x10ffff],      (1,16,1,0,0)),  # \U10ffff, end of 4-byte range
        (UInt32[0xd800,0xdc00], (1,0x30,1,0,0)),  # \U10000 as a surrogate pair (same flags as the CESU-8 case)
        (UInt32[0xdbff,0xdfff], (1,0x30,1,0,0)))  # \U10ffff as a surrogate pair (same flags as the CESU-8 case)
    @test Base.checkstring(units) == expected
end
# Test bounds checking
# b"abcdef" has indices 1 through 6, so -10, 0 and 7 all fall outside it.

# Second positional argument (the starting position) out of range
for badpos in (-10, 0, 7)
    @test_throws BoundsError Base.checkstring(b"abcdef", badpos)
end
# Third positional argument out of range while the starting position is valid
# (presumably the end position -- confirm against the checkstring signature)
for badpos in (-10, 0, 7)
    @test_throws BoundsError Base.checkstring(b"abcdef", 3, badpos)
end
# Third argument in range but smaller than the starting position
@test_throws ArgumentError Base.checkstring(b"abcdef", 3, 1)
|