/usr/share/bro/base/utils/urls.bro is in bro-common 2.5-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 | ##! Functions for URL handling.
## A regular expression for matching and extracting URLs.
const url_regex = /^([a-zA-Z\-]{3,5})(:\/\/[^\/?#"'\r\n><]*)([^?#"'\r\n><]*)([^[:blank:]\r\n"'><]*|\??[^"'\r\n><]*)/ &redef;
## A URI, as parsed by :bro:id:`decompose_uri`.
type URI: record {
## The URL's scheme..
scheme: string &optional;
## The location, which could be a domain name or an IP address. Left empty if not
## specified.
netlocation: string;
## Port number, if included in URI.
portnum: count &optional;
## Full including the file name. Will be '/' if there's not path given.
path: string;
## Full file name, including extension, if there is a file name.
file_name: string &optional;
## The base filename, without extension, if there is a file name.
file_base: string &optional;
## The filename's extension, if there is a file name.
file_ext: string &optional;
## A table of all query parameters, mapping their keys to values, if there's a
## query.
params: table[string] of string &optional;
};
## Extracts URLs discovered in arbitrary text.
function find_all_urls(s: string): string_set
{
return find_all(s, url_regex);
}
## Extracts URLs discovered in arbitrary text without
## the URL scheme included.
function find_all_urls_without_scheme(s: string): string_set
{
local urls = find_all_urls(s);
local return_urls: set[string] = set();
for ( url in urls )
{
local no_scheme = sub(url, /^([a-zA-Z\-]{3,5})(:\/\/)/, "");
add return_urls[no_scheme];
}
return return_urls;
}
function decompose_uri(uri: string): URI
{
local parts: string_vec;
local u = URI($netlocation="", $path="/");
local s = uri;
if ( /\?/ in s )
{
u$params = table();
parts = split_string1(s, /\?/);
s = parts[0];
local query = parts[1];
if ( /&/ in query )
{
local opv = split_string(query, /&/);
for ( each in opv )
{
if ( /=/ in opv[each] )
{
parts = split_string1(opv[each], /=/);
u$params[parts[0]] = parts[1];
}
}
}
else if ( /=/ in query )
{
parts = split_string1(query, /=/);
u$params[parts[0]] = parts[1];
}
}
if ( /:\/\// in s )
{
# Parse scheme and remove from s.
parts = split_string1(s, /:\/\//);
u$scheme = parts[0];
s = parts[1];
}
if ( /\// in s )
{
# Parse path and remove from s.
parts = split_string1(s, /\//);
s = parts[0];
u$path = fmt("/%s", parts[1]);
if ( |u$path| > 1 && u$path[|u$path| - 1] != "/" )
{
local last_token = find_last(u$path, /\/.+/);
local full_filename = split_string1(last_token, /\//)[1];
if ( /\./ in full_filename )
{
u$file_name = full_filename;
u$file_base = split_string1(full_filename, /\./)[0];
u$file_ext = split_string1(full_filename, /\./)[1];
}
else
{
u$file_name = full_filename;
u$file_base = full_filename;
}
}
}
if ( /:/ in s )
{
# Parse location and port.
parts = split_string1(s, /:/);
u$netlocation = parts[0];
u$portnum = to_count(parts[1]);
}
else
{
u$netlocation = s;
}
return u;
}
|