/usr/include/pegtl/contrib/unescape.hh is in pegtl-dev 1.3.1-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 | // Copyright (c) 2014-2015 Dr. Colin Hirsch and Daniel Frey
// Please see LICENSE for license or visit https://github.com/ColinH/PEGTL/
#ifndef PEGTL_CONTRIB_UNESCAPE_HH
#define PEGTL_CONTRIB_UNESCAPE_HH
#include <cassert>
#include <stdexcept>
#include <string>
#include <pegtl/ascii.hh>
#include <pegtl/parse_error.hh>
namespace pegtl
{
namespace unescape
{
// Ready-made state for the unescape actions below: it provides the
// 'unescaped' string member that the actions append their output to
// through their State argument.
struct state
{
   std::string unescaped;  // Collects the unescaped result.
};
// Utility functions for the unescape actions.
// Appends the UTF-8 encoding of the code point utf32 to string.
//
// Returns true when the code point was appended.  Returns false, and leaves
// string untouched, when utf32 cannot be encoded as well-formed UTF-8, i.e.
// when it is larger than 0x10ffff or when it is a UTF-16 surrogate
// (0xd800 - 0xdfff).  Callers must check the result to detect such input.
inline bool utf8_append_utf32( std::string & string, const unsigned utf32 )
{
   // One byte: 0xxxxxxx
   if ( utf32 <= 0x7f ) {
      string += char( utf32 & 0xff );
      return true;
   }
   // Two bytes: 110xxxxx 10xxxxxx
   if ( utf32 <= 0x7ff ) {
      const char tmp[] = { char( ( ( utf32 & 0x7c0 ) >> 6 ) | 0xc0 ),
                           char( ( ( utf32 & 0x03f ) ) | 0x80 ) };
      string.append( tmp, sizeof( tmp ) );
      return true;
   }
   // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
   if ( utf32 <= 0xffff ) {
      if ( ( 0xd800 <= utf32 ) && ( utf32 <= 0xdfff ) ) {
         // Surrogates are reserved for UTF-16 and must not be encoded in UTF-8.
         return false;
      }
      const char tmp[] = { char( ( ( utf32 & 0xf000 ) >> 12 ) | 0xe0 ),
                           char( ( ( utf32 & 0x0fc0 ) >> 6 ) | 0x80 ),
                           char( ( ( utf32 & 0x003f ) ) | 0x80 ) };
      string.append( tmp, sizeof( tmp ) );
      return true;
   }
   // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   if ( utf32 <= 0x10ffff ) {
      const char tmp[] = { char( ( ( utf32 & 0x1c0000 ) >> 18 ) | 0xf0 ),
                           char( ( ( utf32 & 0x03f000 ) >> 12 ) | 0x80 ),
                           char( ( ( utf32 & 0x000fc0 ) >> 6 ) | 0x80 ),
                           char( ( ( utf32 & 0x00003f ) ) | 0x80 ) };
      string.append( tmp, sizeof( tmp ) );
      return true;
   }
   // Beyond the last valid Unicode code point.
   return false;
}
// Converts one hexadecimal digit ('0'-'9', 'a'-'f', 'A'-'F') to its numeric
// value 0 - 15, returned as type I.
// This function MUST only be called for characters matching pegtl::ascii::xdigit!
// Any other character violates that precondition and raises std::runtime_error,
// so that control never flows off the end of this non-void function, not even
// when assertions are disabled with NDEBUG.
template< typename I >
I unhex_char( const char c )
{
   switch ( c ) {
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7': case '8': case '9':
         return I( c - '0' );
      case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
         return I( c - 'a' + 10 );
      case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
         return I( c - 'A' + 10 );
   }
   throw std::runtime_error( "invalid character in unhex" ); // LCOV_EXCL_LINE
}
// Interprets the characters in [ begin, end ) as a hexadecimal number, most
// significant digit first, and returns it as type I.  An empty range yields 0.
// Every character is converted with unhex_char and therefore has to be a
// hexadecimal digit.
template< typename I >
I unhex_string( const char * begin, const char * const end )
{
   I value = 0;
   for ( const char * digit = begin; digit != end; ++digit ) {
      value <<= 4;  // Make room for the next nibble.
      value += unhex_char< I >( *digit );
   }
   return value;
}
// Actions for common unescape situations.
// Action that copies the matched input verbatim: all in.size() characters
// starting at in.begin() are appended to st.unescaped.
struct append_all
{
   template< typename Input, typename State >
   static void apply( const Input & in, State & st )
   {
      const auto first = in.begin();
      const auto count = in.size();
      st.unescaped.append( first, count );
   }
};
// Action that replaces one escaped character by the character it stands for.
// T must be pegtl::one< Qs ... >, listing the accepted escape characters, and
// Rs ... lists their replacements in the same order; both lists must have the
// same length, which is checked at compile time.
// This action MUST only be applied to input that consists of exactly one
// character matching T.
template< typename T, char ... Rs >
struct unescape_c
{
   // Appends the replacement for the single matched character to st.unescaped.
   template< typename Input, typename State >
   static void apply( const Input & in, State & st )
   {
      assert( in.size() == 1 );
      // The null pointer is only used to deduce Qs ... from T; it is never dereferenced.
      st.unescaped += apply_one( * in.begin(), static_cast< const T * >( nullptr ) );
   }

   // Recovers the characters Qs ... from T and looks up the replacement for c.
   template< char ... Qs >
   static char apply_one( const char c, const one< Qs ... > * )
   {
      static_assert( sizeof ... ( Qs ) == sizeof ... ( Rs ), "size mismatch between escaped characters and their mappings" );
      return apply_two( c, { Qs ... }, { Rs ... } );
   }

   // Returns the element of r at the position where c is first found in q.
   // Throws std::runtime_error when c is not in q; that is a violation of the
   // precondition above, and throwing keeps control from flowing off the end
   // of this non-void function when assertions are disabled with NDEBUG.
   static char apply_two( const char c, const std::initializer_list< char > & q, const std::initializer_list< char > & r )
   {
      auto to = r.begin();
      for ( auto from = q.begin(); from != q.end(); ++from, ++to ) {
         if ( *from == c ) {
            return *to;
         }
      }
      throw std::runtime_error( "invalid character in unescape" ); // LCOV_EXCL_LINE
   }
};
// See examples/unescape.cc to see why the following two actions
// have the convenience of skipping the first input character...
// Action for escaped code points given as hexdigits.  The first character of
// the matched input is skipped, the remaining characters are read as one
// hexadecimal number, and that code point is appended to st.unescaped as
// UTF-8.  Throws parse_error when utf8_append_utf32 rejects the code point.
struct unescape_u
{
   template< typename Input, typename State >
   static void apply( const Input & in, State & st )
   {
      assert( ! in.empty() ); // First character MUST be present, usually 'u' or 'U'.
      const unsigned code_point = unhex_string< unsigned >( in.begin() + 1, in.end() );
      const bool appended = utf8_append_utf32( st.unescaped, code_point );
      if ( ! appended ) {
         throw parse_error( "invalid escaped unicode code point", in );
      }
   }
};
// Action for escaped bytes given as hexdigits.  The first character of the
// matched input is skipped, the remaining characters are read as one
// hexadecimal number of type char, and that single char is appended to
// st.unescaped.
struct unescape_x
{
   template< typename Input, typename State >
   static void apply( const Input & in, State & st )
   {
      assert( ! in.empty() ); // First character MUST be present, usually 'x'.
      const auto first_digit = in.begin() + 1;
      const char byte = unhex_string< char >( first_digit, in.end() );
      st.unescaped += byte;
   }
};
// Like unescape_u, but (a) assumes exactly 4 hexdigits per code point,
// and (b) accepts multiple consecutive escaped 16-bit values.
// It encodes a UTF-16 surrogate pair as a single UTF-8 sequence
// as required for JSON by RFC 7159.
// A surrogate that is not part of a high/low pair has no well-formed UTF-8
// encoding and is reported with a parse_error instead of being written out;
// in that case the output of earlier escapes from the same match has already
// been appended to st.unescaped.
struct unescape_j
{
   template< typename Input, typename State >
   static void apply( const Input & in, State & st )
   {
      assert( ( ( in.size() + 1 ) % 6 ) == 0 ); // Expects multiple "\\u1234" with the first backslash already skipped.
      const char * const end = in.end();
      // b points at the 4 hexdigits of the current escape; escapes are 6 characters apart.
      for ( const char * b = in.begin() + 1; b < end; b += 6 ) {
         const unsigned c = unhex_string< unsigned >( b, b + 4 );
         const bool high_surrogate = ( 0xd800 <= c ) && ( c <= 0xdbff );
         if ( high_surrogate && ( b + 6 < end ) ) {
            const unsigned d = unhex_string< unsigned >( b + 6, b + 10 );
            if ( ( 0xdc00 <= d ) && ( d <= 0xdfff ) ) {
               // High surrogate followed by low surrogate: consume both escapes
               // and emit the code point they represent (0x10000 - 0x10ffff).
               b += 6;
               const unsigned combined = ( ( ( c & 0x03ff ) << 10 ) | ( d & 0x03ff ) ) + 0x10000;
               if ( ! utf8_append_utf32( st.unescaped, combined ) ) {
                  throw parse_error( "invalid escaped unicode code point", in );
               }
               continue;
            }
         }
         // Whatever is still in the surrogate range here is unpaired.
         const bool lone_surrogate = ( 0xd800 <= c ) && ( c <= 0xdfff );
         if ( lone_surrogate || ! utf8_append_utf32( st.unescaped, c ) ) {
            throw parse_error( "invalid escaped unicode code point", in );
         }
      }
   }
};
} // unescape
} // pegtl
#endif
|