/usr/share/racket/collects/net/uri-codec.rkt is in racket-common 6.3-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 | #lang racket/base
#|
People used to wonder why semicolons were the default. We then
decided to switch the default back to ampersands --
http://www.w3.org/TR/html401/appendix/notes.html#h-B.2.2
We recommend that HTTP server implementors, and in particular, CGI
implementors support the use of ";" in place of "&" to save authors
the trouble of escaping "&" characters in this manner.
See more in PR8831.
|#
;;; <uri-codec.rkt> ---- En/Decode URLs and form-urlencoded data
;;; Time-stamp: <03/04/25 10:31:31 noel>
;;;
;;; Copyright (C) 2002 by Noel Welsh.
;;;
;;; This file is part of Net.
;;; Net is free software; you can redistribute it and/or
;;; modify it under the terms of the GNU Lesser General Public
;;; License as published by the Free Software Foundation; either
;;; version 2.1 of the License, or (at your option) any later version.
;;; Net is distributed in the hope that it will be useful,
;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;;; Lesser General Public License for more details.
;;; You should have received a copy of the GNU Lesser General Public
;;; License along with Net; if not, write to the Free Software
;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
;;; 02110-1301 USA.
;;; Author: Noel Welsh <noelwelsh@yahoo.com>
;;
;; Commentary:
;; The module provides functions to encode and decode strings using
;; the URI encoding rules given in RFC 2396, and to encode and decode
;; name/value pairs using the application/x-www-form-urlencoded
;; mimetype given the in HTML 4.0 specification. There are minor
;; differences between the two encodings.
;; The URI encoding uses allows a few characters to be represented `as
;; is': a-z, A-Z, 0-9, -, _, ., !, ~, *, ', ( and ). The remaining
;; characters are encoded as %xx, where xx is the hex representation
;; of the integer value of the character (where the mapping
;; character<->integer is determined by US-ASCII if the integer is
;; <128).
;; The encoding, inline with RFC 2396's recommendation, represents a
;; character as is, if possible. The decoding allows any characters
;; to be represented by their hex values, and allows characters to be
;; incorrectly represented `as is'.
;; The rules for the application/x-www-form-urlencoded mimetype given
;; in the HTML 4.0 spec are:
;; 1. Control names and values are escaped. Space characters are
;; replaced by `+', and then reserved characters are escaped as
;; described in [RFC1738], section 2.2: Non-alphanumeric characters
;; are replaced by `%HH', a percent sign and two hexadecimal digits
;; representing the ASCII code of the character. Line breaks are
;; represented as "CR LF" pairs (i.e., `%0D%0A').
;; 2. The control names/values are listed in the order they appear
;; in the document. The name is separated from the value by `=' and
;; name/value pairs are separated from each other by `&'.
;; NB: RFC 2396 supersedes RFC 1738.
;; This differs slightly from the straight encoding in RFC 2396 in
;; that `+' is allowed, and represents a space. We follow this
;; convention, encoding a space as `+' and decoding `+' as a space.
;; There appear to be some brain-dead decoders on the web, so we also
;; encode `!', `~', `'', `(' and ')' using their hex representation.
;; This is the same choice as made by the Java URLEncoder.
;; Draws inspiration from encode-decode.scm by Kurt Normark and a code
;; sample provided by Eli Barzilay
(require racket/string racket/list)
(provide uri-encode uri-decode
uri-path-segment-encode uri-path-segment-decode
uri-userinfo-encode uri-userinfo-decode
uri-unreserved-encode uri-unreserved-decode
uri-path-segment-unreserved-encode uri-path-segment-unreserved-decode
form-urlencoded-encode form-urlencoded-decode
alist->form-urlencoded form-urlencoded->alist
current-alist-separator-mode)
(define (self-map-chars str) (map (λ (ch) (cons ch ch)) (string->list str)))
;; The characters that always map to themselves
(define alphanumeric-mapping
(self-map-chars
"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"))
;; Characters that sometimes map to themselves
;; called 'mark' in RFC 3986
(define safe-mapping (self-map-chars "-_.!~*'()"))
;; The strict URI mapping
(define uri-mapping `(,@alphanumeric-mapping ,@safe-mapping))
;; The uri path segment mapping from RFC 3986
(define path-segment-extra-mapping (self-map-chars "@+,=$&:"))
(define uri-path-segment-mapping `(,@uri-mapping ,@path-segment-extra-mapping))
;; from RFC 3986
(define unreserved-mapping `(,@alphanumeric-mapping ,@(self-map-chars "-._~")))
;; The uri path segment mapping from RFC 3986
(define uri-path-segment-unreserved-mapping
`(,@unreserved-mapping ,@path-segment-extra-mapping))
;; from RFC 3986
(define sub-delims-mapping (self-map-chars "!$&'()*+,;="))
;; The uri userinfo mapping from RFC 3986
(define uri-userinfo-mapping
`(,@unreserved-mapping ,@sub-delims-mapping ,@(self-map-chars ":")))
;; The form-urlencoded mapping
(define form-urlencoded-mapping
`(,@(self-map-chars ".-*_") (#\space . #\+) ,@alphanumeric-mapping))
(define (number->hex-string number)
(define (hex n) (string-ref "0123456789ABCDEF" n))
(string #\% (hex (quotient number 16)) (hex (modulo number 16))))
(define ascii-size 128)
;; (listof (cons char char)) -> (values (vectorof string) (vectorof int))
(define (make-codec-tables alist)
(define encoding-table (build-vector ascii-size number->hex-string))
(define decoding-table (build-vector ascii-size values))
(for ([orig+enc (in-list alist)])
(vector-set! encoding-table
(char->integer (car orig+enc))
(string (cdr orig+enc)))
(vector-set! decoding-table
(char->integer (cdr orig+enc))
(char->integer (car orig+enc))))
(values encoding-table decoding-table))
;; vector string -> string
(define (encode table str)
;; First, check for an ASCII string with no conversion needed:
(if (for/and ([char (in-string str)])
(define v (char->integer char))
(and (< v ascii-size)
(let ([s (vector-ref table v)])
(and (= 1 (string-length s))
(eq? char (string-ref s 0))))))
str
(let ([out (open-output-string)])
(for ([byte (in-bytes (string->bytes/utf-8 str))])
(cond [(< byte ascii-size)
(write-string (vector-ref table byte) out)]
[else
(write-string (number->hex-string byte) out)]))
(get-output-string out))))
;; vector string -> string
(define (decode table str)
(define max-ascii (integer->char ascii-size))
(define in (open-input-string str))
(define out (open-output-bytes))
(let loop ()
(define c (read-char in))
(unless (eof-object? c)
(cond [(eqv? c #\%)
(define hex (read-string 2 in))
(define hex-n (and (string? hex) (string->number hex 16)))
(cond [(exact-nonnegative-integer? hex-n) ;; not negative, fractional
;; Note: write as byte to support multi-byte Unicode chars
(write-byte hex-n out)]
[else
;; Pass through failed %-escapes as-is, for compatibility with
;; previous version of code.
(write-char #\% out)
(when (string? hex)
(write-string hex out))])]
[(char<? c max-ascii)
(write-char (integer->char (vector-ref table (char->integer c))) out)]
[else
;; This should probably error, but strings to be decoded might
;; come from misbehaving sources; maybe it's better to add some
;; parameter for a permissive mode; one source of such bad URLs
;; is user-defined strings where the string is entered directly
;; and not properly encoded -- similar justification to
;; browsers accepting unencoded chars in manually entered URLs.
(write-char c out)])
(loop)))
(get-output-string out))
;; Utility for defining codecs
(define-syntax-rule (define-codecs [encoder decoder mapping] ...)
(begin (define-values [encoder decoder]
(let-values ([(v:en v:de) (make-codec-tables mapping)])
(define (encoder str) (encode v:en str))
(define (decoder str) (decode v:de str))
(values encoder decoder)))
...))
;; All of these are string -> string
(define-codecs
[uri-encode uri-decode uri-mapping]
[uri-path-segment-encode uri-path-segment-decode uri-path-segment-mapping]
[uri-userinfo-encode uri-userinfo-decode uri-userinfo-mapping]
[uri-unreserved-encode uri-unreserved-decode unreserved-mapping]
[uri-path-segment-unreserved-encode uri-path-segment-unreserved-decode
uri-path-segment-unreserved-mapping]
[form-urlencoded-encode form-urlencoded-decode form-urlencoded-mapping])
;; listof (cons string string) -> string
;; http://www.w3.org/TR/html401/appendix/notes.html#ampersands-in-uris
;; listof (cons symbol string) -> string
(define (alist->form-urlencoded args)
(string-join
(for/list ([arg (in-list args)])
(define name (form-urlencoded-encode (symbol->string (car arg))))
(define value (and (cdr arg) (form-urlencoded-encode (cdr arg))))
(if value (string-append name "=" value) name))
(if (memq (current-alist-separator-mode) '(semi semi-or-amp)) ";" "&")))
;; string -> listof (cons string string)
;; http://www.w3.org/TR/html401/appendix/notes.html#ampersands-in-uris
(define (form-urlencoded->alist str)
(define sep-regexp
(case (current-alist-separator-mode)
[(semi) #rx"[;]"]
[(amp) #rx"[&]"]
[else #rx"[&;]"]))
(if (equal? "" str) '()
(for/list ([keyval (in-list (regexp-split sep-regexp str))])
;; m = #f => no "=..." part
(define m (regexp-match-positions #rx"=" keyval))
(cons (string->symbol (form-urlencoded-decode
(if m (substring keyval 0 (caar m)) keyval)))
(and m (form-urlencoded-decode (substring keyval (cdar m))))))))
(define current-alist-separator-mode
(make-parameter 'amp-or-semi
(lambda (s)
(unless (memq s '(amp semi amp-or-semi semi-or-amp))
(raise-type-error 'current-alist-separator-mode
"'amp, 'semi, 'amp-or-semi, or 'semi-or-amp"
s))
s)))
;;; uri-codec.rkt ends here
|