Merge pull request #12645 from MrPetovan/bug/warnings
Replace HTML regular expression by HTML::extractCharset in ParseUrl::getSiteInfo
This commit is contained in:
commit
1fc7d5ae85
6 changed files with 1684 additions and 21 deletions
|
@ -23,6 +23,7 @@ namespace Friendica\Content\Text;
|
|||
|
||||
use DOMDocument;
|
||||
use DOMXPath;
|
||||
use Friendica\Protocol\HTTP\MediaType;
|
||||
use Friendica\Content\Widget\ContactBlock;
|
||||
use Friendica\Core\Hook;
|
||||
use Friendica\Core\Renderer;
|
||||
|
@ -1055,4 +1056,30 @@ class HTML
|
|||
|
||||
return $result !== false && $result->length > 0;
|
||||
}
|
||||
|
||||
/**
 * Looks up the character set declared by the meta tags of an HTML document.
 *
 * A `<meta charset="...">` attribute is checked first. When none yields a value, the
 * `charset` parameter of a `<meta http-equiv="content-type" content="...">` tag is used.
 *
 * @param DOMDocument $doc
 * @return string|null Lowercase charset
 */
public static function extractCharset(DOMDocument $doc): ?string
{
    $finder = new DOMXPath($doc);

    $declaredCharset = $finder->evaluate("string(//meta[@charset]/@charset)");
    if ($declaredCharset) {
        return strtolower($declaredCharset);
    }

    // Selects the content attribute of a meta tag whose http-equiv attribute equals "content-type"
    // and whose content attribute contains "charset"; translate() makes both comparisons ignore case
    $query = "string(//meta[@http-equiv][translate(@http-equiv, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz') = 'content-type'][contains(translate(@content, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'charset')]/@content)";

    try {
        $contentType = MediaType::fromContentType($finder->evaluate($query));
        $charset     = $contentType->parameters['charset'] ?? null;
    } catch (\InvalidArgumentException $e) {
        // The content attribute couldn't be parsed as a media type, so no charset is reported
        $charset = null;
    }

    return $charset === null ? null : strtolower($charset);
}
|
||||
}
|
||||
|
|
|
@ -253,12 +253,12 @@ class System
|
|||
$func['database'] = in_array($func['class'], ['Friendica\Database\DBA', 'Friendica\Database\Database']);
|
||||
if (!$previous['database'] || !$func['database']) {
|
||||
$classparts = explode("\\", $func['class']);
|
||||
$callstack[] = array_pop($classparts).'::'.$func['function'] . '(' . $func['line'] . ')';
|
||||
$callstack[] = array_pop($classparts).'::'.$func['function'] . (isset($func['line']) ? ' (' . $func['line'] . ')' : '');
|
||||
$previous = $func;
|
||||
}
|
||||
} elseif (!in_array($func['function'], $ignore)) {
|
||||
$func['database'] = ($func['function'] == 'q');
|
||||
$callstack[] = $func['function'] . '(' . $func['line'] . ')';
|
||||
$callstack[] = $func['function'] . (isset($func['line']) ? ' (' . $func['line'] . ')' : '');
|
||||
$func['class'] = '';
|
||||
$previous = $func;
|
||||
}
|
||||
|
|
237
src/Protocol/HTTP/MediaType.php
Normal file
237
src/Protocol/HTTP/MediaType.php
Normal file
|
@ -0,0 +1,237 @@
|
|||
<?php
|
||||
/**
|
||||
* @copyright Copyright (C) 2010-2023, the Friendica project
|
||||
*
|
||||
* @license GNU AGPL version 3 or any later version
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as
|
||||
* published by the Free Software Foundation, either version 3 of the
|
||||
* License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*/
|
||||
|
||||
namespace Friendica\Protocol\HTTP;
|
||||
|
||||
/**
 * Immutable value object for an HTTP media type such as `text/html; charset=utf-8`.
 *
 * The type, subtype and parameters are validated on construction and are exposed read-only
 * through the magic accessors.
 *
 * @see https://httpwg.org/specs/rfc9110.html#media.type
 *
 * @property-read string   $type
 * @property-read string   $subType
 * @property-read string[] $parameters
 */
final class MediaType
{
    public const DQUOTE = '"';
    public const DIGIT  = '0-9';
    public const ALPHA  = 'a-zA-Z';

    // @see https://www.charset.org/charsets/us-ascii
    public const VCHAR = "\\x21-\\x7E";

    public const SYMBOL_NO_DELIM = "!#$%&'*+-.^_`|~";

    public const OBSTEXT = "\\x80-\\xFF";

    public const QDTEXT = "\t \\x21\\x23-\\x5B\\x5D-\\x7E" . self::OBSTEXT;

    /**
     * Names of the private properties readable through __get()/__isset()
     */
    private const READABLE_PROPERTIES = ['type', 'subType', 'parameters'];

    /**
     * @var string
     */
    private $type;

    /**
     * @var string
     */
    private $subType;

    /**
     * @var string[]
     */
    private $parameters;

    /**
     * @param string   $type       Must be a token
     * @param string   $subType    Must be a token
     * @param string[] $parameters Unquoted values indexed by parameter name; names must be tokens
     * @throws \InvalidArgumentException If any part can't be represented in a media type
     */
    public function __construct(string $type, string $subType, array $parameters = [])
    {
        if (!self::isToken($type)) {
            throw new \InvalidArgumentException("Type isn't a valid token: " . $type);
        }

        if (!self::isToken($subType)) {
            throw new \InvalidArgumentException("Subtype isn't a valid token: " . $subType);
        }

        foreach ($parameters as $key => $value) {
            if (!self::isToken($key)) {
                throw new \InvalidArgumentException("Parameter key isn't a valid token: " . $key);
            }

            if (!self::isToken($value) && !self::isQuotableString($value)) {
                throw new \InvalidArgumentException("Parameter value isn't a valid token or a quotable string: " . $value);
            }
        }

        $this->type       = $type;
        $this->subType    = $subType;
        $this->parameters = $parameters;
    }

    /**
     * Read-only access to the type, subtype and parameters.
     *
     * @param string $name
     * @return string|string[]
     * @throws \InvalidArgumentException If the property isn't one of the readable ones
     */
    public function __get(string $name)
    {
        if (!in_array($name, self::READABLE_PROPERTIES, true)) {
            throw new \InvalidArgumentException('Unknown property ' . $name);
        }

        return $this->$name;
    }

    /**
     * Makes isset() report the read-only properties as set, consistently with __get().
     *
     * @param string $name
     * @return bool
     */
    public function __isset(string $name): bool
    {
        return in_array($name, self::READABLE_PROPERTIES, true);
    }

    /**
     * Parses a Content-Type header value.
     *
     * Parameter names are lowercased, quoted-string values are unquoted, and a later
     * parameter overwrites an earlier one with the same name.
     *
     * @param string $contentType e.g. `text/html; charset=utf-8`
     * @return self
     * @throws \InvalidArgumentException If the string is empty or isn't a well-formed media type
     */
    public static function fromContentType(string $contentType): self
    {
        if (trim($contentType) === '') {
            throw new \InvalidArgumentException('Provided string is empty');
        }

        // A plain explode() would cut quoted-string values containing a semicolon in half
        $parts = self::splitOutsideQuotedStrings($contentType, ';');

        $mimeTypeParts = explode('/', trim(array_shift($parts)));
        if (count($mimeTypeParts) !== 2) {
            throw new \InvalidArgumentException('Provided string doesn\'t look like a MIME type: ' . $contentType);
        }

        [$type, $subType] = $mimeTypeParts;

        $parameters = [];
        foreach ($parts as $parameterString) {
            $parameter = trim($parameterString);
            if ($parameter === '') {
                // Tolerates trailing or doubled semicolons
                continue;
            }

            // Only the first equal sign separates the name from the value, a quoted value may contain more
            $parameterParts = explode('=', $parameter, 2);
            if (count($parameterParts) < 2) {
                throw new \InvalidArgumentException('Parameter lacks a value: ' . $parameterString);
            }

            [$key, $value] = $parameterParts;

            if (self::isQuotedString($value)) {
                $value = self::extractQuotedStringValue($value);
            } elseif (strpos($value, '=') !== false) {
                throw new \InvalidArgumentException('Parameter has too many values: ' . $parameterString);
            } elseif (!self::isToken($value)) {
                throw new \InvalidArgumentException("Parameter value isn't a valid token or a quoted string: \"" . $value . '"');
            }

            // Parameter keys are case-insensitive, values are not
            $parameters[strtolower($key)] = $value;
        }

        return new self($type, $subType, $parameters);
    }

    /**
     * Serializes the media type, quoting the parameter values that aren't tokens.
     *
     * @return string
     */
    public function __toString(): string
    {
        $string = $this->type . '/' . $this->subType;

        foreach ($this->parameters as $key => $value) {
            $string .= '; ' . $key . '=';
            $string .= self::isToken($value)
                ? $value
                : self::DQUOTE . addcslashes($value, '"\\') . self::DQUOTE;
        }

        return $string;
    }

    /**
     * Splits a string on a single-character delimiter, ignoring the delimiters located inside
     * double-quoted sections. Behaves like explode() when the string contains no double quote.
     *
     * An unterminated quoted section extends to the end of the string.
     *
     * @param string $string
     * @param string $delimiter Single character
     * @return string[] At least one element
     */
    private static function splitOutsideQuotedStrings(string $string, string $delimiter): array
    {
        $parts    = [];
        $current  = '';
        $inQuotes = false;
        $length   = strlen($string);

        for ($i = 0; $i < $length; $i++) {
            $char = $string[$i];

            if ($inQuotes && $char === '\\' && $i + 1 < $length) {
                // Quoted pair: the escaped character is copied verbatim so an escaped double quote doesn't end the section
                $current .= $char . $string[++$i];
                continue;
            }

            if ($char === self::DQUOTE) {
                $inQuotes = !$inQuotes;
            } elseif ($char === $delimiter && !$inQuotes) {
                $parts[] = $current;
                $current = '';
                continue;
            }

            $current .= $char;
        }

        $parts[] = $current;

        return $parts;
    }

    /**
     * token = 1*tchar
     * tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*"
     *       / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~"
     *       / DIGIT / ALPHA
     *       ; any VCHAR, except delimiters
     *
     * @see https://httpwg.org/specs/rfc9110.html#tokens
     *
     * @param string $string
     * @return bool
     */
    private static function isToken(string $string): bool
    {
        $symbol = preg_quote(self::SYMBOL_NO_DELIM, '/');

        // The D modifier prevents "$" from matching before a trailing line feed
        $pattern = '/^[' . $symbol . self::DIGIT . self::ALPHA . ']+$/D';

        return preg_match($pattern, $string) === 1;
    }

    /**
     * quoted-string = DQUOTE *( qdtext / quoted-pair ) DQUOTE
     * qdtext        = HTAB / SP / %x21 / %x23-5B / %x5D-7E / obs-text
     *
     * @see https://httpwg.org/specs/rfc9110.html#quoted.strings
     *
     * @param string $string
     * @return bool
     */
    private static function isQuotedString(string $string): bool
    {
        $qdtext     = '[' . self::QDTEXT . ']';
        $quotedPair = "\\\\[\t " . self::VCHAR . self::OBSTEXT . ']';

        // The D modifier prevents "$" from matching before a trailing line feed
        $pattern = '/^' . self::DQUOTE . '(?:' . $qdtext . '|' . $quotedPair . ')*' . self::DQUOTE . '$/D';

        return preg_match($pattern, $string) === 1;
    }

    /**
     * Is the string an extracted quoted string value?
     *
     * Any sequence, including the empty one, of the characters allowed in a quoted pair
     * (HTAB, SP, VCHAR, obs-text) qualifies; qdtext is a subset of these.
     *
     * @param string $string
     * @return bool
     */
    private static function isQuotableString(string $string): bool
    {
        // The D modifier prevents "$" from matching before a trailing line feed
        $pattern = "/^[\t " . self::VCHAR . self::OBSTEXT . ']*$/D';

        return preg_match($pattern, $string) === 1;
    }

    /**
     * Extracts the value from a quoted-string, removing quoted pairs
     *
     * @param string $value A string accepted by isQuotedString()
     * @return string
     */
    private static function extractQuotedStringValue(string $value): string
    {
        // Drops the surrounding double quotes
        $inner = (string)substr($value, 1, -1);

        // Replaces each quoted pair by the character it escapes
        return preg_replace("/\\\\([\t " . self::VCHAR . self::OBSTEXT . '])/', '$1', $inner);
    }
}
|
|
@ -24,6 +24,8 @@ namespace Friendica\Util;
|
|||
use DOMDocument;
|
||||
use DOMXPath;
|
||||
use Friendica\Content\OEmbed;
|
||||
use Friendica\Content\Text\HTML;
|
||||
use Friendica\Protocol\HTTP\MediaType;
|
||||
use Friendica\Core\Hook;
|
||||
use Friendica\Core\Logger;
|
||||
use Friendica\Database\Database;
|
||||
|
@ -283,25 +285,13 @@ class ParseUrl
|
|||
}
|
||||
|
||||
$charset = '';
|
||||
// Look for a charset, first in headers
|
||||
// Expected form: Content-Type: text/html; charset=ISO-8859-4
|
||||
if (preg_match('/charset=([a-z0-9-_.\/]+)/i', $curlResult->getContentType(), $matches)) {
|
||||
$charset = trim(trim(trim(array_pop($matches)), ';,'));
|
||||
} else {
|
||||
// Then in body that gets precedence
|
||||
// Expected forms:
|
||||
// - <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||
// - <meta charset="utf-8">
|
||||
// - <meta charset=utf-8>
|
||||
// - <meta charSet="utf-8">
|
||||
// We escape <style> and <script> tags since they can contain irrelevant charset information
|
||||
// (see https://github.com/friendica/friendica/issues/9251#issuecomment-698636806)
|
||||
Strings::performWithEscapedBlocks($body, '#<(?:style|script).*?</(?:style|script)>#ism', function ($body) use (&$charset) {
|
||||
if (preg_match('/charset=["\']?([a-z0-9-_.\/]+)/i', $body, $matches)) {
|
||||
$charset = trim(trim(trim(array_pop($matches)), ';,'));
|
||||
}
|
||||
});
|
||||
}
|
||||
try {
|
||||
// Look for a charset, first in headers
|
||||
$mediaType = MediaType::fromContentType($curlResult->getContentType());
|
||||
if (isset($mediaType->parameters['charset'])) {
|
||||
$charset = $mediaType->parameters['charset'];
|
||||
}
|
||||
} catch(\InvalidArgumentException $e) {}
|
||||
|
||||
$siteinfo['charset'] = $charset;
|
||||
|
||||
|
@ -322,6 +312,8 @@ class ParseUrl
|
|||
$doc = new DOMDocument();
|
||||
@$doc->loadHTML($body);
|
||||
|
||||
$siteinfo['charset'] = HTML::extractCharset($doc) ?? $siteinfo['charset'];
|
||||
|
||||
XML::deleteNode($doc, 'style');
|
||||
XML::deleteNode($doc, 'option');
|
||||
XML::deleteNode($doc, 'h1');
|
||||
|
|
File diff suppressed because one or more lines are too long
150
tests/src/Protocol/HTTP/MediaTypeTest.php
Normal file
150
tests/src/Protocol/HTTP/MediaTypeTest.php
Normal file
|
@ -0,0 +1,150 @@
|
|||
<?php
|
||||
/**
|
||||
* @copyright Copyright (C) 2010-2023, the Friendica project
|
||||
*
|
||||
* @license GNU AGPL version 3 or any later version
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as
|
||||
* published by the Free Software Foundation, either version 3 of the
|
||||
* License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
*/
|
||||
|
||||
namespace Friendica\Test\src\Protocol\HTTP;
|
||||
|
||||
use Friendica\Protocol\HTTP\MediaType;
|
||||
|
||||
/**
 * Covers parsing of Content-Type values into MediaType objects, rejection of malformed
 * values and serialization back to a string.
 */
class MediaTypeTest extends \PHPUnit\Framework\TestCase
{
    /**
     * Well-formed Content-Type values with the MediaType they are expected to parse into.
     * Dataset keys mirror the parameter names of testValid().
     *
     * @return array
     */
    public static function dataValid(): array
    {
        return [
            'HTML UTF-8' => [
                'expected'    => new MediaType('text', 'html', ['charset' => 'utf-8']),
                'contentType' => 'text/html; charset=utf-8',
            ],
            'HTML Northern Europe' => [
                'expected'    => new MediaType('text', 'html', ['charset' => 'ISO-8859-4']),
                'contentType' => 'text/html; charset=ISO-8859-4',
            ],
            'multipart/form-data' => [
                'expected'    => new MediaType('multipart', 'form-data', ['boundary' => '---------------------------974767299852498929531610575']),
                'contentType' => 'multipart/form-data; boundary=---------------------------974767299852498929531610575',
            ],
            'Multiple parameters' => [
                'expected'    => new MediaType('application', 'octet-stream', ['charset' => 'ISO-8859-4', 'another' => 'parameter']),
                'contentType' => 'application/octet-stream; charset=ISO-8859-4 ; another=parameter',
            ],
            'No parameters' => [
                'expected'    => new MediaType('application', 'vnd.adobe.air-application-installer-package+zip'),
                'contentType' => 'application/vnd.adobe.air-application-installer-package+zip',
            ],
            'No parameters colon' => [
                'expected'    => new MediaType('application', 'vnd.adobe.air-application-installer-package+zip'),
                'contentType' => 'application/vnd.adobe.air-application-installer-package+zip;',
            ],
            'No parameters space colon' => [
                'expected'    => new MediaType('application', 'vnd.adobe.air-application-installer-package+zip'),
                'contentType' => 'application/vnd.adobe.air-application-installer-package+zip ;',
            ],
            'No parameters space colon space' => [
                'expected'    => new MediaType('application', 'vnd.adobe.air-application-installer-package+zip'),
                'contentType' => 'application/vnd.adobe.air-application-installer-package+zip ; ',
            ],
            'Parameter quoted string' => [
                'expected'    => new MediaType('text', 'html', ['parameter' => 'Quoted string with a space and a "double-quote"']),
                'contentType' => 'text/html; parameter="Quoted string with a space and a \"double-quote\""',
            ],
        ];
    }

    /**
     * @dataProvider dataValid
     *
     * @param MediaType $expected
     * @param string    $contentType
     * @return void
     */
    public function testValid(MediaType $expected, string $contentType): void
    {
        self::assertEquals($expected, MediaType::fromContentType($contentType));
    }

    /**
     * Content-Type values that must be rejected.
     *
     * @return array
     */
    public static function dataInvalid(): array
    {
        return [
            'no slash'                  => ['application'],
            'two slashes'               => ['application/octet/stream'],
            'parameter no value'        => ['application/octet-stream ; parameter'],
            'parameter too many values' => ['application/octet-stream ; parameter=value1=value2'],
            'type non token'            => ['appli"cation/octet-stream'],
            'subtype non token'         => ['application/octet\-stream'],
            'parameter name non token'  => ['application/octet-stream; para"meter=value'],
            'parameter value invalid'   => ['application/octet-stream; parameter="value"value'],
        ];
    }

    /**
     * @dataProvider dataInvalid
     *
     * @param string $contentType
     * @return void
     */
    public function testInvalid(string $contentType): void
    {
        $this->expectException(\InvalidArgumentException::class);

        MediaType::fromContentType($contentType);
    }

    /**
     * MediaType objects with the string they are expected to serialize to.
     * Dataset keys mirror the parameter names of testToString().
     *
     * @return array
     */
    public static function dataToString(): array
    {
        return [
            'HTML UTF-8' => [
                'expected'  => 'text/html; charset=utf-8',
                'mediaType' => new MediaType('text', 'html', ['charset' => 'utf-8']),
            ],
            'HTML Northern Europe' => [
                'expected'  => 'text/html; charset=ISO-8859-4',
                'mediaType' => new MediaType('text', 'html', ['charset' => 'ISO-8859-4']),
            ],
            'multipart/form-data' => [
                'expected'  => 'multipart/form-data; boundary=---------------------------974767299852498929531610575',
                'mediaType' => new MediaType('multipart', 'form-data', ['boundary' => '---------------------------974767299852498929531610575']),
            ],
            'Multiple parameters' => [
                'expected'  => 'application/octet-stream; charset=ISO-8859-4; another=parameter',
                'mediaType' => new MediaType('application', 'octet-stream', ['charset' => 'ISO-8859-4', 'another' => 'parameter']),
            ],
            'No parameters' => [
                'expected'  => 'application/vnd.adobe.air-application-installer-package+zip',
                'mediaType' => new MediaType('application', 'vnd.adobe.air-application-installer-package+zip'),
            ],
            'Parameter quoted string' => [
                'expected'  => 'text/html; parameter="Quoted string with a space and a \"double-quote\""',
                'mediaType' => new MediaType('text', 'html', ['parameter' => 'Quoted string with a space and a "double-quote"']),
            ],
        ];
    }

    /**
     * @dataProvider dataToString
     *
     * @param string    $expected
     * @param MediaType $mediaType
     * @return void
     */
    public function testToString(string $expected, MediaType $mediaType): void
    {
        // Strict comparison, and the cast exercises the same path as implicit string conversion
        self::assertSame($expected, (string)$mediaType);
    }
}
|
Loading…
Reference in a new issue