/usr/share/php/kohana3.2/system/utf8/to_unicode.php is in libkohana3.2-core-php 3.2.2-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
<?php defined('SYSPATH') or die('No direct script access.');
/**
* UTF8::to_unicode
*
* @package Kohana
* @author Kohana Team
* @copyright (c) 2007-2012 Kohana Team
* @copyright (c) 2005 Harry Fuecks
* @license http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt
*/
/**
 * Decodes a UTF-8 byte string into an array of Unicode code points.
 *
 * The string is walked octet by octet with a small state machine. Each
 * completed character is validated (shortest form only, no surrogates,
 * nothing above U+10FFFF, no 5/6 octet sequences) before it is appended
 * to the result. The code point U+FEFF (BOM) is accepted but never added
 * to the output, wherever it occurs in the string.
 *
 * Error reporting is split in two, and both styles are kept for backward
 * compatibility with existing callers:
 *
 *  - an illegal lead octet or an illegal/overlong code point raises an
 *    E_USER_WARNING via trigger_error() and makes the function return FALSE;
 *  - an incomplete multi-octet sequence throws UTF8_Exception. This covers
 *    both a sequence interrupted by a non-continuation octet and a sequence
 *    cut off by the end of the input.
 *
 * @param   string  $str  UTF-8 encoded byte string
 * @return  array|FALSE   list of integer code points, or FALSE on an illegal sequence
 * @throws  UTF8_Exception  if a multi-octet sequence is incomplete
 */
function _to_unicode($str)
{
    // Number of continuation octets still expected before the current character is complete
    $m_state = 0;
    // Code point being assembled from the current sequence
    $m_ucs4 = 0;
    // Total number of octets in the current sequence (used for the shortest-form check)
    $m_bytes = 1;

    $out = array();
    $len = strlen($str);

    for ($i = 0; $i < $len; $i++)
    {
        $in = ord($str[$i]);

        if ($m_state == 0)
        {
            // No sequence in progress: expect US-ASCII or the lead octet of a multi-octet sequence.
            if (0 == (0x80 & $in))
            {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $m_bytes = 1;
            }
            elseif (0xC0 == (0xE0 & $in))
            {
                // Lead octet of a 2 octet sequence (110xxxxx)
                $m_ucs4 = ($in & 0x1F) << 6;
                $m_state = 1;
                $m_bytes = 2;
            }
            elseif (0xE0 == (0xF0 & $in))
            {
                // Lead octet of a 3 octet sequence (1110xxxx)
                $m_ucs4 = ($in & 0x0F) << 12;
                $m_state = 2;
                $m_bytes = 3;
            }
            elseif (0xF0 == (0xF8 & $in))
            {
                // Lead octet of a 4 octet sequence (11110xxx)
                $m_ucs4 = ($in & 0x07) << 18;
                $m_state = 3;
                $m_bytes = 4;
            }
            elseif (0xF8 == (0xFC & $in))
            {
                /**
                 * Lead octet of a 5 octet sequence.
                 *
                 * This is illegal because the encoded code point must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, carry on until the end of
                 * the sequence and let the validation below reject it.
                 */
                $m_ucs4 = ($in & 0x03) << 24;
                $m_state = 4;
                $m_bytes = 5;
            }
            elseif (0xFC == (0xFE & $in))
            {
                // Lead octet of a 6 octet sequence; handled like the 5 octet case above.
                $m_ucs4 = ($in & 1) << 30;
                $m_state = 5;
                $m_bytes = 6;
            }
            else
            {
                // Neither US-ASCII nor a legal lead octet (e.g. a stray continuation octet, 0xFE or 0xFF).
                trigger_error('UTF8::to_unicode: Illegal sequence identifier in UTF-8 at byte '.$i, E_USER_WARNING);
                return FALSE;
            }
        }
        else
        {
            // A sequence is in progress: only a continuation octet (10xxxxxx) is acceptable.
            if (0x80 == (0xC0 & $in))
            {
                // Merge the 6 payload bits at the position given by the number of octets still to come.
                $shift = ($m_state - 1) * 6;
                $m_ucs4 |= ($in & 0x0000003F) << $shift;

                // When the counter reaches zero, $m_ucs4 holds the complete code point.
                if (0 == --$m_state)
                {
                    // Check for illegal sequences and code points.
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $m_bytes) AND ($m_ucs4 < 0x0080)) OR
                        ((3 == $m_bytes) AND ($m_ucs4 < 0x0800)) OR
                        ((4 == $m_bytes) AND ($m_ucs4 < 0x10000)) OR
                        (4 < $m_bytes) OR
                        // From Unicode 3.2, surrogate characters are illegal
                        (($m_ucs4 & 0xFFFFF800) == 0xD800) OR
                        // Code points outside the Unicode range are illegal
                        ($m_ucs4 > 0x10FFFF))
                    {
                        trigger_error('UTF8::to_unicode: Illegal sequence or codepoint in UTF-8 at byte '.$i, E_USER_WARNING);
                        return FALSE;
                    }

                    if (0xFEFF != $m_ucs4)
                    {
                        // BOM is legal but we don't want to output it
                        $out[] = $m_ucs4;
                    }

                    // Reset the decoder for the next character
                    $m_state = 0;
                    $m_ucs4 = 0;
                    $m_bytes = 1;
                }
            }
            else
            {
                // A non-continuation octet arrived while the sequence was still incomplete.
                throw new UTF8_Exception("UTF8::to_unicode: Incomplete multi-octet sequence in UTF-8 at byte ':byte'", array(
                    ':byte' => $i,
                ));
            }
        }
    }

    if ($m_state != 0)
    {
        // The input ended while continuation octets were still expected. Without this
        // check a truncated trailing character would be dropped silently. The reported
        // offset is the position where the missing octet should have been.
        throw new UTF8_Exception("UTF8::to_unicode: Incomplete multi-octet sequence in UTF-8 at byte ':byte'", array(
            ':byte' => $len,
        ));
    }

    return $out;
}
|