/usr/share/openvas/scap/xml_split is in openvas-manager-common 6.0.9-2.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 | #!/usr/bin/env python
# OpenVAS Manager
# $Id$
# Description: Script to split an xml file in a memory efficient way
#
# Authors:
# Benoit Allard <benoit.allard@greenbone.net>
#
# Copyright:
# Copyright (C) 2014 Greenbone Networks GmbH
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
import os
import xml.parsers.expat
from xml.sax.saxutils import escape
from optparse import OptionParser
from math import log10
# How much data we process at a time
CHUNK_SIZE = 1024 * 1024
# The sequence of element leading us to the current one
path = []
# How far we are in the current file
cur_size = 0
# From how much should we start another file
MAX_SIZE = 1024*1024 # 1Mb
# The current index
cur_idx = 0
# The current file handle we are writing to
cur_file = None
# The format string used to introduce the index in the file to be written
FMT = ".%d"
# The filename we are playing with
out_dir = None
root = None
ext = None
# The xml declaration of the file.
xml_declaration = None
# What was the signature of the last start element
start = None
# if we are currently in the process of changing file
ending = False
def attrs_s(attrs):
""" This generate the XML attributes from an element attribute list """
l = ['']
for i in range(0,len(attrs), 2):
l.append('%s="%s"' % (attrs[i], escape(attrs[i+1])))
return ' '.join(l)
def next_file():
""" This makes the decision to cut the current file and starta new one """
# We only want to split the files between <entry>
if len(path) != 1:
return
global cur_size, ending
if (not ending) and (cur_size > MAX_SIZE):
# size above threshold, and not already ending
global cur_file, cur_idx
print "part %d Done" % cur_idx
ending = True
# Close the current elements
for elem in reversed(path):
end_element(elem[0])
# Close the file
cur_file.close()
# reset the size
cur_size = 0
# Open another file
cur_idx += 1
cur_file = open(os.path.join(out_dir, root + FMT % cur_idx + ext),
'wt')
if xml_declaration is not None:
cur_file.write('<?xml%s?>\n' % attrs_s(xml_declaration))
# Start again where we stopped
for elem in path:
start_element(*elem)
# We are done 'ending'
ending = False
def xml_decl(version, encoding, standalone):
global xml_declaration
l = ['version', version, 'encoding', encoding]
if standalone != -1:
l.extend(['standalone', 'yes' if standalone else 'no'])
xml_declaration = l
cur_file.write('<?xml%s?>\n' % attrs_s(xml_declaration))
def start_element(name, attrs):
""" Called by the parser when he meet a start element """
global cur_size, start
if start is not None:
# Chaining starts after each others
cur_file.write('<%s%s>' % (start[0], attrs_s(start[1])))
start = (name, attrs)
if ending:
return
cur_size += len(name) + sum(len(k) for k in attrs)
path.append((name, attrs))
def end_element(name):
""" Caled by the parser when he meet an end element """
global cur_size
global start
if start is not None:
# Empty element, good, we did not wrote the start part
cur_file.write('<%s%s/>' % (start[0],attrs_s(start[1])))
else:
# There was some data, close it normaly
cur_file.write('</%s>' % name)
start = None
if ending:
return
elem = path.pop()
assert elem[0] == name
cur_size += len(name)
next_file()
def char_data(data):
""" Called by the parser when he meet data """
global cur_size, start
wroteStart = False
if start is not None:
# The data belong to an element, we should write the start part first
cur_file.write('<%s%s>' % (start[0], attrs_s(start[1])))
start = None
wroteStart = True
# ``escape`` is too much for us, only & and < ned to be escaped there ...
data = data.replace('&', '&')
data = data.replace('<', '<')
if data == '>':
data = '>'
cur_file.write(data.encode('utf-8'))
cur_size += len(data)
if not wroteStart:
# The data was outside of an element, it could be the right moment to
# make the split
next_file()
def main(filepath, output_dir):
# Create a parser
p = xml.parsers.expat.ParserCreate()
# We want to reproduce the input, so we are interested in the order of the
# attributess
p.ordered_attributes = 1
# Set our callbacks (we are stripping comments out by not defining
# callbacks for them)
p.XmlDeclHandler = xml_decl
p.StartElementHandler = start_element
p.EndElementHandler = end_element
p.CharacterDataHandler = char_data
global cur_file, cur_idx
global out_dir, root, ext
global FMT
FMT = ".%%0%dd" % (int(log10(float(os.path.getsize(filepath)) / MAX_SIZE)) + 1)
out_dir, filename = os.path.split(filepath)
if output_dir is not None:
out_dir = output_dir
root, ext = os.path.splitext(filename)
cur_file = open(os.path.join(out_dir, root + FMT % cur_idx + ext), 'wt')
with open(filepath, 'rt') as xml_file:
while True:
# Read a chunk
chunk = xml_file.read(CHUNK_SIZE)
if len(chunk) < CHUNK_SIZE:
# End of file
# tell the parser we're done
p.Parse(chunk, 1)
# exit the loop
break
# process the chunk
p.Parse(chunk)
# Don't forget to close our handle
cur_file.close()
print "part %d Done" % cur_idx
if __name__ == "__main__":
parser = OptionParser(usage="usage: %prog [options] XML_FILE")
parser.add_option("-o", "--output-dir",
help="Specify the directory where the xml files will be written" \
"(default to the same directory where the original file is)")
parser.add_option("-M", "--max_size", type="int",
help="Specify the size at which the files should be split (in Kb)")
(options, args) = parser.parse_args()
if len(args) != 1:
parser.error("incorrect number of arguments")
if options.max_size is not None:
MAX_SIZE = options.max_size * 1024
main(args[0], options.output_dir)
|