Implement URL parsing/unparsing per RFC 3986.

- Section 5.3 (Component Recomposition) of RFC 3986 distinguishes
  between undefined components and empty components; PHP's
  built-in parse_url does not. This patch deals with that issue
  and ensures, for instance, that empty queries and fragments
  are detected.
This commit is contained in:
Dave Longley 2014-12-03 02:05:03 -05:00
parent fed40914c8
commit 1a9c6bffdd

View file

@ -506,59 +506,45 @@ function jsonld_parse_url($url) {
$url = ''; $url = '';
} }
$rval = parse_url($url); $keys = array(
'href', 'protocol', 'scheme', '?authority', 'authority',
'?auth', 'auth', 'user', 'pass', 'host', '?port', 'port', 'path',
'?query', 'query', '?fragment', 'fragment');
$regex = "/^(([^:\/?#]+):)?(\/\/(((([^:@]*)(?::([^:@]*))?)?@)?([^:\/?#]*)(:(\d*))?))?([^?#]*)(\?([^#]*))?(#(.*))?/";
preg_match($regex, $url, $match);
// malformed url $rval = array();
if($rval === false) { $flags = array();
$rval = array(); $len = count($keys);
} for($i = 0; $i < $len; ++$i) {
$key = $keys[$i];
$rval['href'] = $url; if(strpos($key, '?') === 0) {
if(!isset($rval['scheme'])) { $flags[substr($key, 1)] = !empty($match[$i]);
$rval['scheme'] = ''; } else if(!isset($match[$i])) {
$rval['protocol'] = ''; $rval[$key] = null;
} else {
$rval['protocol'] = $rval['scheme'] . ':';
}
if(!isset($rval['host'])) {
$rval['host'] = '';
}
if(!isset($rval['path'])) {
$rval['path'] = '';
}
if(isset($rval['user']) || isset($rval['pass'])) {
$rval['auth'] = '';
if(isset($rval['user'])) {
$rval['auth'] = $rval['user'];
}
if(isset($rval['pass'])) {
$rval['auth'] .= ":{$rval['pass']}";
}
}
// parse authority for unparsed relative network-path reference
if(strpos($rval['href'], ':') === false &&
strpos($rval['href'], '//') === 0 && $rval['host'] === '') {
// must parse authority from pathname
$rval['path'] = substr($rval['path'], 2);
$idx = strpos($rval['path'], '/');
if($idx === false) {
$rval['authority'] = $rval['path'];
$rval['path'] = '';
} else { } else {
$rval['authority'] = substr($rval['path'], 0, $idx); $rval[$key] = $match[$i];
$rval['path'] = substr($rval['path'], $idx);
}
} else {
$rval['authority'] = $rval['host'];
if(isset($rval['port'])) {
$rval['authority'] .= ":{$rval['port']}";
}
if(isset($rval['auth'])) {
$rval['authority'] = "{$rval['auth']}@{$rval['authority']}";
} }
} }
if(!$flags['authority']) {
$rval['authority'] = null;
}
if(!$flags['auth']) {
$rval['auth'] = $rval['user'] = $rval['pass'] = null;
}
if(!$flags['port']) {
$rval['port'] = null;
}
if(!$flags['query']) {
$rval['query'] = null;
}
if(!$flags['fragment']) {
$rval['fragment'] = null;
}
$rval['normalizedPath'] = jsonld_remove_dot_segments( $rval['normalizedPath'] = jsonld_remove_dot_segments(
$rval['path'], $rval['authority'] !== ''); $rval['path'], !!$rval['authority']);
return $rval; return $rval;
} }
@ -628,47 +614,66 @@ function jsonld_prepend_base($base, $iri) {
// parse given IRI // parse given IRI
$rel = jsonld_parse_url($iri); $rel = jsonld_parse_url($iri);
// start hierarchical part // per RFC3986 5.2.2
$hierPart = $base['protocol']; $transform = array('protocol' => $base['protocol']);
if($rel['authority']) {
$hierPart .= "//{$rel['authority']}";
} else if($base['href'] !== '') {
$hierPart .= "//{$base['authority']}";
}
// per RFC3986 normalize if($rel['authority'] !== null) {
$transform['authority'] = $rel['authority'];
// IRI represents an absolute path $transform['path'] = $rel['path'];
if(strpos($rel['path'], '/') === 0) { $transform['query'] = $rel['query'];
$path = $rel['path'];
} else { } else {
$path = $base['path']; $transform['authority'] = $base['authority'];
// append relative path to the end of the last directory from base if($rel['path'] === '') {
if($rel['path'] !== '') { $transform['path'] = $base['path'];
$idx = strrpos($path, '/'); if($rel['query'] !== null) {
$idx = ($idx === false) ? 0 : $idx + 1; $transform['query'] = $rel['query'];
$path = substr($path, 0, $idx); } else {
if(strlen($path) > 0 && substr($path, -1) !== '/') { $transform['query'] = $base['query'];
$path .= '/';
} }
$path .= $rel['path']; } else {
if(strpos($rel['path'], '/') === 0) {
// IRI represents an absolute path
$transform['path'] = $rel['path'];
} else {
// merge paths
$path = $base['path'];
// append relative path to the end of the last directory from base
if($rel['path'] !== '') {
$idx = strrpos($path, '/');
$idx = ($idx === false) ? 0 : $idx + 1;
$path = substr($path, 0, $idx);
if(strlen($path) > 0 && substr($path, -1) !== '/') {
$path .= '/';
}
$path .= $rel['path'];
}
$transform['path'] = $path;
}
$transform['query'] = $rel['query'];
} }
} }
// remove slashes and dots in path // remove slashes and dots in path
$path = jsonld_remove_dot_segments($path, $hierPart !== ''); $transform['path'] = jsonld_remove_dot_segments(
$transform['path'], !!$transform['authority']);
// add query and hash // construct URL
if(isset($rel['query'])) { $rval = $transform['protocol'];
$path .= "?{$rel['query']}"; if($transform['authority'] !== null) {
$rval .= '//' . $transform['authority'];
} }
if(isset($rel['fragment'])) { $rval .= $transform['path'];
$path .= "#{$rel['fragment']}"; if($transform['query'] !== null) {
$rval .= '?' . $transform['query'];
}
if($rel['fragment'] !== null) {
$rval .= '#' . $rel['fragment'];
} }
$rval = $hierPart . $path; // handle empty base
if($rval === '') { if($rval === '') {
$rval = './'; $rval = './';
} }
@ -716,7 +721,7 @@ function jsonld_remove_base($base, $iri) {
// is a hash or query) // is a hash or query)
$base_segments = explode('/', $base['normalizedPath']); $base_segments = explode('/', $base['normalizedPath']);
$iri_segments = explode('/', $rel['normalizedPath']); $iri_segments = explode('/', $rel['normalizedPath']);
$last = (isset($rel['query']) || isset($rel['fragment'])) ? 0 : 1; $last = ($rel['query'] || $rel['fragment']) ? 0 : 1;
while(count($base_segments) > 0 && count($iri_segments) > $last) { while(count($base_segments) > 0 && count($iri_segments) > $last) {
if($base_segments[0] !== $iri_segments[0]) { if($base_segments[0] !== $iri_segments[0]) {
break; break;
@ -740,10 +745,10 @@ function jsonld_remove_base($base, $iri) {
$rval .= implode('/', $iri_segments); $rval .= implode('/', $iri_segments);
// add query and hash // add query and hash
if(isset($rel['query'])) { if($rel['query'] !== null) {
$rval .= "?{$rel['query']}"; $rval .= "?{$rel['query']}";
} }
if(isset($rel['fragment'])) { if($rel['fragment'] !== null) {
$rval .= "#{$rel['fragment']}"; $rval .= "#{$rel['fragment']}";
} }