/usr/lib/python2.7/dist-packages/swap/webAccess.py is in python-swap 1.2.1-7.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
"""Web Access
This module implements some basic bits of the web architecture:
dereferencing a URI to get a document, with content negotiation,
and deciding on the basis of the Internet Content Type what to do with it.
$Id: webAccess.py,v 1.34 2007/08/06 16:13:56 syosi Exp $
Web access functionality building on urllib2
"""
import sys, os
#import urllib
import urllib2, urllib # Python standard
from why import newTopLevelFormula
import uripath # http://www.w3.org/2000/10/swap/uripath.py
import diag
from diag import progress
import notation3 # Parser @@@ Registery of parsers vs content types woudl be better.
from OrderedSequence import indentString
# HTTP response-header key used to look up the served Content-Type.
HTTP_Content_Type = 'content-type' #@@ belongs elsewhere?

# Mirror of diag's flag: when true, every URI accessed by webget() is
# appended to diag.file_list (used for listing test files).
print_all_file_names = diag.print_all_file_names # for listing test files
class SecurityError(IOError):
    """Raised when sandboxed mode forbids an access (e.g. file: URIs)."""
    pass
# A little code to represent a value that can be set
# and read; a singleton. In essence, this is a little
# prettier than a one element list
def setting(self, val=None):
    """Read, or set and read, a single stored value.

    Called with no argument, returns the current value; called with
    *val*, stores it first and returns it.  `self` is the one-element
    list this function is bound to (see `sandBoxed` below).
    """
    if val is not None:
        self[0] = val
    return self[0]

# Bind `setting` to a one-element list via the descriptor protocol,
# producing a settable/readable singleton: sandBoxed() reads the flag,
# sandBoxed(True) sets it.  Default is False (web access unrestricted).
sandBoxed = setting.__get__([False])
def cacheHack(addr):
    """Map a remote w3.org URI to a local mirror copy when one exists.

    Looks under /devel/WWW/ for the corresponding path, bare or with an
    .rdf or .n3 suffix, and returns a file: URI for the first hit.
    Any other URI -- or a miss -- is returned unchanged.  Intended for
    offline ("on a plane") use.
    """
    remote_prefix = "http://www.w3.org/"
    if not addr.startswith(remote_prefix):
        return addr
    candidate = "/devel/WWW/" + addr[len(remote_prefix):]
    for suffix in ("", ".rdf", ".n3"):
        path = candidate + suffix
        try:
            os.stat(path)
        except OSError:
            continue
        progress("Offline: Using local copy %s" % path)
        return "file://" + path
    return addr
def urlopenForRDF(addr, referer=None):
    """Open *addr* for reading, asking the server for RDF serializations.

    Thin wrapper over webget() that supplies an Accept list preferring
    N3, then RDF/XML.  Returns a file-like object with .headers.
    """
    preferred_types = ['text/rdf+n3',
                       'application/rdf+xml'
                       # ,'application/x-turtle' # Why not ask for turtle?
                       ]
    return webget(addr, types=preferred_types, referer=referer)
def webget(addr, referer=None, types=()):
    """Open a URI for reading; return a file-like object with .headers.

    addr    -- absolute URI to dereference
    referer -- optional value for the Referer header (consistently
               misspelt, per the HTTP spec)
    types   -- sequence of content types to send in the Accept header
               (BUG FIX: was a mutable default list, `types=[]`; a
               mutable default is shared across calls)

    Raises SecurityError for file: URIs when sandBoxed() is set.
    cf http://www.w3.org/TR/2004/REC-webarch-20041215/#dereference-uri
    """
    if diag.chatty_flag > 7: progress("Accessing: " + addr)
    if sandBoxed():
        if addr[:5] == 'file:':
            raise SecurityError('local file access prohibited')
    # addr = cacheHack(addr)

    # Work around python stdlib bugs with data: URIs (buggy in 2.4.2
    # with cStringIO): plain urllib copes where urllib2 does not.
    if addr[:5] == 'data:':
        # return open_data(addr)
        return urllib.urlopen(addr)

    req = urllib2.Request(addr)
    if types:
        req.add_header('Accept', ','.join(types))
    if referer: #consistently misspelt
        req.add_header('Referer', referer)
    stream = urllib2.urlopen(req)
    if print_all_file_names:
        diag.file_list.append(addr)
    return stream
def load(store, uri=None, openFormula=None, asIfFrom=None, contentType=None,
        flags="", referer=None, why=None, topLevel=False):
    """Get and parse document.  Guesses format if necessary.

    store       -- the triple store to parse into
    uri         -- if None, load from standard input
    openFormula -- formula to parse into; a new one is created if None
    asIfFrom    -- base URI to parse as if the document came from there
    contentType -- caller-supplied content type, consulted only if the
                   server sends none
    flags       -- parser flags ('rdflib' selects the rdflib RDF/XML parser)
    referer     -- value for the HTTP Referer header
    why         -- provenance object, passed through to the parsers
    topLevel    -- if true, register the result as a top-level formula

    Returns: top-level formula of the parsed document.
    Raises: IOError, SyntaxError, DocumentError

    This is an independent function, as it is fairly independent
    of the store. However, it is natural to call it as a method on the store.
    And a proliferation of APIs confuses.
    """
    try:
        baseURI = uripath.base()
        if uri is not None:
            addr = uripath.join(baseURI, uri) # Make abs from relative
            if diag.chatty_flag > 40: progress("Taking input from " + addr)
            netStream = urlopenForRDF(addr, referer)
            if diag.chatty_flag > 60:
                progress(" Headers for %s: %s\n" %(addr, netStream.headers.items()))
            receivedContentType = netStream.headers.get(HTTP_Content_Type, None)
        else:
            if diag.chatty_flag > 40: progress("Taking input from standard input")
            addr = uripath.join(baseURI, "STDIN") # Make abs from relative
            netStream = sys.stdin
            receivedContentType = None

        # @@How to get at all headers??
        # @@ Get sensible net errors and produce diagnostics

        # Decide on a content type: trust the server's header first, then
        # the caller's hint, and finally sniff the document's first bytes.
        guess = None
        if receivedContentType:
            if diag.chatty_flag > 9:
                progress("Recieved Content-type: " + repr(receivedContentType) + " for "+addr)
            # Anything mentioning 'xml', or 'rdf' without 'n3', is RDF/XML.
            if receivedContentType.find('xml') >= 0 or (
                    receivedContentType.find('rdf') >= 0
                    and not (receivedContentType.find('n3') >= 0)):
                guess = "application/rdf+xml"
            elif receivedContentType.find('n3') >= 0:
                guess = "text/rdf+n3"
        if guess is None and contentType:
            if diag.chatty_flag > 9:
                progress("Given Content-type: " + repr(contentType) + " for "+addr)
            if contentType.find('xml') >= 0 or (
                    contentType.find('rdf') >= 0 and not (contentType.find('n3') >= 0)):
                guess = "application/rdf+xml"
            elif contentType.find('n3') >= 0:
                guess = "text/rdf+n3"
            # BUG FIX: the original tested `contentType.find('rq')` with no
            # `>= 0`; find() returns -1 (truthy) on a miss, so almost any
            # content type reaching this branch was classified as SPARQL.
            elif contentType.find('sparql') >= 0 or contentType.find('rq') >= 0:
                guess = "x-application/sparql"
        buffer = netStream.read()
        if guess is None:
            # Sniff the payload itself.
            # can't be XML if it starts with these...
            if buffer[0:1] == "#" or buffer[0:7] == "@prefix":
                guess = 'text/rdf+n3'
            elif buffer[0:6] == 'PREFIX' or buffer[0:4] == 'BASE':
                guess = "x-application/sparql"
            elif buffer.find('xmlns="') >= 0 or buffer.find('xmlns:') >= 0: #"
                guess = 'application/rdf+xml'
            else:
                guess = 'text/rdf+n3'
        if diag.chatty_flag > 9: progress("Guessed ContentType:" + guess)
    except (IOError, OSError):
        raise DocumentAccessError(addr, sys.exc_info())

    if asIfFrom is None:
        asIfFrom = addr
    if openFormula is not None:
        F = openFormula
    else:
        F = store.newFormula()
    if topLevel:
        newTopLevelFormula(F)
    import os
    if guess == "x-application/sparql":
        if diag.chatty_flag > 49: progress("Parsing as SPARQL")
        from sparql import sparql_parser
        import sparql2cwm
        convertor = sparql2cwm.FromSparql(store, F, why=why)
        import StringIO
        p = sparql_parser.N3Parser(StringIO.StringIO(buffer),
                                   sparql_parser.branches, convertor)
        F = p.parse(sparql_parser.start).close()
    elif guess == 'application/rdf+xml':
        if diag.chatty_flag > 49: progress("Parsing as RDF")
        # Parser selection: the 'rdflib' flag or CWM_RDFLIB env var picks
        # rdflib; otherwise CWM_RDF_PARSER names the parser (default sax2rdf).
        if flags == 'rdflib' or int(os.environ.get("CWM_RDFLIB", 0)):
            parser = 'rdflib'
            flags = ''
        else:
            parser = os.environ.get("CWM_RDF_PARSER", "sax2rdf")
        import rdfxml
        p = rdfxml.rdfxmlparser(store, F, thisDoc=asIfFrom, flags=flags,
                                parser=parser, why=why)
        p.feed(buffer)
        F = p.close()
    else:
        assert guess == 'text/rdf+n3'
        if diag.chatty_flag > 49: progress("Parsing as N3")
        if os.environ.get("CWM_N3_PARSER", 0) == 'n3p':
            import n3p_tm
            import triple_maker
            tm = triple_maker.TripleMaker(formula=F, store=store)
            p = n3p_tm.n3p_tm(asIfFrom, tm)
        else:
            p = notation3.SinkParser(store, F, thisDoc=asIfFrom, flags=flags, why=why)
        try:
            p.startDoc()
            p.feed(buffer)
            p.endDoc()
        except:
            # BUG FIX: '%' binds tighter than 'or', so the original computed
            # ("Failed to parse %s" % uri) or buffer -- the formatted string
            # is always truthy, so the buffer fallback was dead code.
            progress("Failed to parse %s" % (uri or buffer))
            raise
    if not openFormula:
        F = F.close()
    return F
def loadMany(store, uris, openFormula=None):
    """Get, parse and merge several documents, given a list of URIs.

    Guesses format if necessary.
    Returns top-level formula which is the parse result.
    Raises IOError, SyntaxError
    """
    assert isinstance(uris, list)
    if openFormula is None:
        F = store.newFormula()
    else:
        F = openFormula
    # BUG FIX: the original computed `f = F.uriref()` and never used it;
    # the dead local has been removed.
    for u in uris:
        F.reopen()  # should not be necessary
        store.load(u, openFormula=F, remember=0)
    return F.close()
# @@@@@@@@@@@@@ Ripped from python2.4/lib/urllib which is buggy
# File "/devel/WWW/2000/10/swap/webAccess.py", line 104, in load
# netStream = urlopenForRDF(addr, referer)
# File "/devel/WWW/2000/10/swap/webAccess.py", line 72, in urlopenForRDF
# return urllib.urlopen(addr)
# File "/sw/lib/python2.4/urllib.py", line 77, in urlopen
# return opener.open(url)
# File "/sw/lib/python2.4/urllib.py", line 185, in open
# return getattr(self, name)(url)
# File "/sw/lib/python2.4/urllib.py", line 559, in open_data
# f.fileno = None # needed for addinfourl
#AttributeError: 'cStringIO.StringI' object has no attribute 'fileno'
# $ cwm 'data:text/rdf+n3;charset=utf-8;base64,QHByZWZpeCBsb2c6IDxodHRwOi8vd3d3LnczLm9yZy8yMDAwLzEwL3N3YXAvbG9nIz4gLgp7fSA9PiB7OmEgOmIgOmN9IC4g'
# Found the bug in python bug traker.
# http://sourceforge.net/tracker/index.php?func=detail&aid=1365984&group_id=5470&atid=105470
# "Fixed in revision 41548 and 41549 (2.4). by birkenfeld"
# It is in effect fixed in python 2.4.4
def open_data(url, data=None):
    """Use "data" URL.

    Local copy of urllib.open_data, carried here because the Python 2.4
    stdlib version was buggy (cStringIO objects lack .fileno; fixed
    upstream in 2.4.4).  POSTed *data* is ignored.  Returns an
    urllib.addinfourl wrapping the decoded payload.

    syntax of data URLs:
    dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
    mediatype := [ type "/" subtype ] *( ";" parameter )
    data      := *urlchar
    parameter := attribute "=" value
    """
    import mimetools, time
    from StringIO import StringIO
    try:
        # NOTE: `type` shadows the builtin; name kept to stay diffable
        # against the stdlib original this was copied from.
        [type, data] = url.split(',', 1)
    except ValueError:
        raise IOError('data error', 'bad data URL')
    if not type:
        type = 'text/plain;charset=US-ASCII'
    semi = type.rfind(';')
    if semi >= 0 and '=' not in type[semi:]:
        # Final ';token' with no '=' is the transfer encoding (e.g. base64).
        encoding = type[semi+1:]
        type = type[:semi]
    else:
        encoding = ''
    msg = []
    msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
                                        time.gmtime(time.time())))
    msg.append('Content-type: %s' % type)
    if encoding == 'base64':
        import base64
        data = base64.decodestring(data)
    else:
        # BUG FIX: the original called bare unquote(), which is not
        # defined or imported anywhere in this module and so raised
        # NameError for every non-base64 data URL; it is urllib.unquote.
        data = urllib.unquote(data)
    msg.append('Content-length: %d' % len(data))
    msg.append('')
    msg.append(data)
    msg = '\n'.join(msg)
    f = StringIO(msg)
    headers = mimetools.Message(f, 0)
    f.fileno = None # needed for addinfourl
    return urllib.addinfourl(f, headers, url)
#@@@@@@@@@@ Junk - just to keep track iof the interface to sandros stuff and rdflib
def getParser(format, inputURI, workingContext, flags):
    """Return something which can load from a URI in the given format, while
    writing to the given store.

    NOTE(review): per the "Junk" comment above, this looks like dead or
    unfinished scaffolding for the sandro/rdflib interface.  The names
    BecauseOfCommandLine, touch, _store, need, lxkb and LX are neither
    defined nor imported in this module, so every branch would raise
    NameError if actually called -- confirm before relying on it.
    """
    # Provenance for the parse: attributed to the command line that ran us.
    r = BecauseOfCommandLine(sys.argv[0]) # @@ add user, host, pid, date time? Privacy!
    if format == "rdf" :
        touch(_store)
        # Flag "l" forces the rdflib-backed parser; otherwise the
        # CWM_RDF_PARSER env var chooses (default sax2rdf).
        if "l" in flags["rdf"]:
            from rdflib2rdf import RDFXMLParser
        else:
            rdfParserName = os.environ.get("CWM_RDF_PARSER", "sax2rdf")
            if rdfParserName == "rdflib2rdf":
                from rdflib2rdf import RDFXMLParser
            elif rdfParserName == "sax2rdf":
                from sax2rdf import RDFXMLParser
            else:
                raise RuntimeError("Unknown RDF parser: " + rdfParserName)
        return RDFXMLParser(_store, workingContext, inputURI,
                            flags=flags[format], why=r)
    elif format == "n3":
        touch(_store)
        return notation3.SinkParser(_store, openFormula=workingContext,
                                    thisDoc=inputURI, why=r)
    else:
        # Fallback: hand off to the LX knowledge-base machinery
        # (lxkb / LX undefined here -- see NOTE above).
        need(lxkb)
        touch(lxkb)
        return LX.language.getParser(language=format,
                                     sink=lxkb,
                                     flags=flags)
class DocumentAccessError(IOError):
    """Raised when a document cannot be fetched or parsed.

    Carries the URI that failed and the sys.exc_info() triple
    describing the underlying error.
    """

    def __init__(self, uri, info):
        self._uri = uri
        self._info = info

    def __str__(self):
        # See C:\Python16\Doc\ref\try.html or URI to that effect
        # Indent the underlying exception's message beneath our own line.
        cause = indentString(str(self._info[1]))
        return "Unable to access document <%s>, because:\n%s" % (self._uri, cause)
|