From 70bf75c3424215c4bcec10b208f74253fccfb3b7 Mon Sep 17 00:00:00 2001 From: Michael Date: Mon, 15 Mar 2021 22:02:21 +0000 Subject: [PATCH] Support for page JSON-LD based page information --- src/Content/OEmbed.php | 55 ++++--- src/Content/PageInfo.php | 17 +-- src/Util/ParseUrl.php | 313 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 355 insertions(+), 30 deletions(-) diff --git a/src/Content/OEmbed.php b/src/Content/OEmbed.php index bda3b503ae..bb3f6290fe 100644 --- a/src/Content/OEmbed.php +++ b/src/Content/OEmbed.php @@ -143,35 +143,54 @@ class OEmbed DI::cache()->set($cache_key, $json_string, $cache_ttl); } - if ($oembed->type == 'error') { + // Always embed the SSL version + if (!empty($oembed->html)) { + $oembed->html = str_replace(['http://www.youtube.com/', 'http://player.vimeo.com/'], ['https://www.youtube.com/', 'https://player.vimeo.com/'], $oembed->html); + } + + // Improve the OEmbed data with data from OpenGraph, Twitter cards and other sources + $data = ParseUrl::getSiteinfoCached($embedurl, true, false); + if (($oembed->type == 'error') && empty($data['title']) && empty($data['text'])) { return $oembed; } - // Always embed the SSL version - $oembed->html = str_replace(['http://www.youtube.com/', 'http://player.vimeo.com/'], ['https://www.youtube.com/', 'https://player.vimeo.com/'], $oembed->html); - - // If fetching information doesn't work, then improve via internal functions - if ($no_rich_type && ($oembed->type == 'rich')) { - $data = ParseUrl::getSiteinfoCached($embedurl, true, false); + if ($no_rich_type || ($oembed->type == 'error')) { + $oembed->html = ''; $oembed->type = $data['type']; if ($oembed->type == 'photo') { $oembed->url = $data['url']; } + } - if (isset($data['title'])) { - $oembed->title = $data['title']; - } + if (!empty($data['title']) && empty($oembed->title)) { + $oembed->title = $data['title']; + } - if (isset($data['text'])) { - $oembed->description = $data['text']; - } + if (!empty($data['text']) && 
empty($oembed->description)) { + $oembed->description = $data['text']; + } - if (!empty($data['images'])) { - $oembed->thumbnail_url = $data['images'][0]['src']; - $oembed->thumbnail_width = $data['images'][0]['width']; - $oembed->thumbnail_height = $data['images'][0]['height']; - } + if (!empty($data['publisher']) && empty($oembed->provider_name)) { + $oembed->provider_name = $data['publisher']; + } + + if (!empty($data['publisher_url']) && empty($oembed->provider_url)) { + $oembed->provider_url = $data['publisher_url']; + } + + if (!empty($data['author']) && empty($oembed->author_name)) { + $oembed->author_name = $data['author']; + } + + if (!empty($data['author_url']) && empty($oembed->author_url)) { + $oembed->author_url = $data['author_url']; + } + + if (!empty($data['images']) && empty($oembed->thumbnail_url)) { + $oembed->thumbnail_url = $data['images'][0]['src']; + $oembed->thumbnail_width = $data['images'][0]['width']; + $oembed->thumbnail_height = $data['images'][0]['height']; } Hook::callAll('oembed_fetch_url', $embedurl, $oembed); diff --git a/src/Content/PageInfo.php b/src/Content/PageInfo.php index 5396bc1bbe..806a92a03c 100644 --- a/src/Content/PageInfo.php +++ b/src/Content/PageInfo.php @@ -129,17 +129,12 @@ class PageInfo } // Escape some bad characters - $data['url'] = str_replace(['[', ']'], ['&#91;', '&#93;'], htmlentities($data['url'], ENT_QUOTES, 'UTF-8', false)); - $data['title'] = str_replace(['[', ']'], ['&#91;', '&#93;'], htmlentities($data['title'], ENT_QUOTES, 'UTF-8', false)); + $text = "[attachment"; - $text = "[attachment type='" . $data['type'] . "'"; - - if (!empty($data['url'])) { - $text .= " url='" . $data['url'] . "'"; - } - - if (!empty($data['title'])) { - $text .= " title='" . $data['title'] . "'"; + foreach (['type', 'url', 'title', 'alternative_title', 'publisher', 'publisher_url', 'publisher_image', 'author', 'author_url', 'author_image'] as $field) { + if (!empty($data[$field])) { + $text .= " " . $field . "='" . 
str_replace(['[', ']'], ['&#91;', '&#93;'], htmlentities($data[$field], ENT_QUOTES, 'UTF-8', false)) . "'"; + } } if (empty($data['text'])) { @@ -167,7 +162,7 @@ class PageInfo } } - $text .= ']' . $data['text'] . '[/attachment]'; + $text .= ']' . str_replace(['[', ']'], ['&#91;', '&#93;'], $data['text']) . '[/attachment]'; $hashtags = ''; if (!empty($data['keywords'])) { diff --git a/src/Util/ParseUrl.php b/src/Util/ParseUrl.php index da6c88abb1..2661445e7e 100644 --- a/src/Util/ParseUrl.php +++ b/src/Util/ParseUrl.php @@ -300,7 +300,6 @@ class ParseUrl @$doc->loadHTML($body); XML::deleteNode($doc, 'style'); - XML::deleteNode($doc, 'script'); XML::deleteNode($doc, 'option'); XML::deleteNode($doc, 'h1'); XML::deleteNode($doc, 'h2'); @@ -392,6 +391,9 @@ class ParseUrl case 'dc.description': $siteinfo['text'] = trim($meta_tag['content']); break; + case 'dc.creator': + $siteinfo['publisher'] = trim($meta_tag['content']); + break; case 'keywords': $keywords = explode(',', $meta_tag['content']); break; @@ -426,12 +428,46 @@ class ParseUrl case 'og:image': $siteinfo['image'] = $meta_tag['content']; break; + case 'og:image:url': + $siteinfo['image'] = $meta_tag['content']; + break; + case 'og:image:secure_url': + $siteinfo['image'] = $meta_tag['content']; + break; case 'og:title': $siteinfo['title'] = trim($meta_tag['content']); break; case 'og:description': $siteinfo['text'] = trim($meta_tag['content']); break; + case 'og:site_name': + $siteinfo['publisher'] = trim($meta_tag['content']); + break; + case 'twitter:description': + $siteinfo['text'] = trim($meta_tag['content']); + break; + case 'twitter:title': + $siteinfo['title'] = trim($meta_tag['content']); + break; + case 'twitter:image': + $siteinfo['image'] = $meta_tag['content']; + break; + } + } + } + + $list = $xpath->query("//script[@type='application/ld+json']"); + foreach ($list as $node) { + if (!empty($node->nodeValue)) { + $nodevalue = html_entity_decode($node->nodeValue, ENT_COMPAT, 'UTF-8'); + if ($jsonld = 
json_decode($nodevalue, true)) { + if (!empty($jsonld['@graph']) && is_array($jsonld['@graph'])) { + foreach ($jsonld['@graph'] as $part) { + $siteinfo = self::parseJsonLd($siteinfo, $part); + } + } else { + $siteinfo = self::parseJsonLd($siteinfo, $jsonld); + } } } } @@ -470,6 +506,281 @@ class ParseUrl return $siteinfo; } + /** + * Improve the siteinfo with information from the provided JSON-LD information + * @see https://jsonld.com/ + * + * @param array $siteinfo + * @param array $jsonld + * @return array siteinfo + */ + private static function parseJsonLd(array $siteinfo, array $jsonld) + { + $type = JsonLD::fetchElement($jsonld, '@type'); + + switch ($type) { + case 'Article': + case 'NewsArticle': + return self::parseJsonLdArticle($siteinfo, $jsonld); + case 'WebPage': + return self::parseJsonLdWebPage($siteinfo, $jsonld); + case 'WebSite': + return self::parseJsonLdWebSite($siteinfo, $jsonld); + case 'Organization': + return self::parseJsonLdWebOrganization($siteinfo, $jsonld); + case 'Person': + return self::parseJsonLdWebPerson($siteinfo, $jsonld); + case 'BreadcrumbList': + case 'Audio': /// @todo Can contain direct media links to audio - can be interesting in the future + case 'VideoObject': + case 'ImageObject': + case 'LiveBlogPosting': + case 'SocialMediaPosting': + // quit silently + return $siteinfo; + default: + Logger::info('Unsupported or unknown type', ['type' => $type, 'url' => $siteinfo['url']]); + return $siteinfo; + } + } + + /** + * Improve the siteinfo with information from the provided JSON-LD information concerning authors and publishers + * + * @param array $siteinfo + * @param array $jsonld + * @return array siteinfo + */ + private static function parseJsonLdAuthor(array $siteinfo, array $jsonld) + { + $jsonldinfo = []; + + if (is_array($jsonld['publisher'])) { + $content = JsonLD::fetchElement($jsonld, 'publisher', 'name', '@type', 'Organization'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['publisher'] = 
trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'publisher', 'url', '@type', 'Organization'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['publisher_url'] = trim($content); + } + + $brand = JsonLD::fetchElement($jsonld, 'publisher', 'brand', '@type', 'Organization'); + if (!empty($brand)) { + $content = JsonLD::fetchElement($brand, 'name', '@type', 'brand'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['publisher'] = trim($content); + } + } + } + + if (is_array($jsonld['author'])) { + $content = JsonLD::fetchElement($jsonld, 'author', 'name', '@type', 'Organization'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['publisher'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'author', 'url', '@type', 'Organization'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['publisher_url'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'author', 'name', '@type', 'Person'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['author'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'author', 'url', '@type', 'Person'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['author_url'] = trim($content); + } + } + + Logger::info('Fetched author information', ['fetched' => $jsonldinfo]); + + return array_merge($siteinfo, $jsonldinfo); + } + + /** + * Improve the siteinfo with information from the provided JSON-LD Article information + * + * @param array $siteinfo + * @param array $jsonld + * @return array siteinfo + */ + private static function parseJsonLdArticle(array $siteinfo, array $jsonld) + { + $jsonldinfo = []; + + $content = JsonLD::fetchElement($jsonld, 'headline'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['title'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'alternativeHeadline'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['alternative_title'] = 
trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'description'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['text'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'image', 'url', '@type', 'ImageObject'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['image'] = trim($content); + } + +/// @todo Check for the correct separator, also check for duplicates before adding +// $content = JsonLD::fetchElement($jsonld, 'keywords'); +// if (!empty($content) && is_string($content)) { +// $jsonldinfo['keywords'] = trim($content); +// } + + $jsonldinfo = self::parseJsonLdAuthor($jsonldinfo, $jsonld); + + Logger::info('Fetched article information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]); + + return array_merge($siteinfo, $jsonldinfo); + } + + /** + * Improve the siteinfo with information from the provided JSON-LD WebPage information + * + * @param array $siteinfo + * @param array $jsonld + * @return array siteinfo + */ + private static function parseJsonLdWebPage(array $siteinfo, array $jsonld) + { + $jsonldinfo = []; + + $content = JsonLD::fetchElement($jsonld, 'name'); + if (!empty($content)) { + $jsonldinfo['title'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'description'); + if (!empty($content)) { + $jsonldinfo['text'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'thumbnailUrl'); + if (!empty($content)) { + $jsonldinfo['image'] = trim($content); + } + + $jsonldinfo = self::parseJsonLdAuthor($jsonldinfo, $jsonld); + + Logger::info('Fetched webpage information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]); + + return array_merge($siteinfo, $jsonldinfo); + } + + /** + * Improve the siteinfo with information from the provided JSON-LD WebSite information + * + * @param array $siteinfo + * @param array $jsonld + * @return array siteinfo + */ + private static function parseJsonLdWebSite(array $siteinfo, array $jsonld) + { + $jsonldinfo = 
[]; + + $content = JsonLD::fetchElement($jsonld, 'name'); + if (!empty($content)) { + $jsonldinfo['publisher'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'description'); + if (!empty($content)) { + $jsonldinfo['publisher_description'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'url'); + if (!empty($content)) { + $jsonldinfo['publisher_url'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'thumbnailUrl'); + if (!empty($content)) { + $jsonldinfo['image'] = trim($content); + } + + $jsonldinfo = self::parseJsonLdAuthor($jsonldinfo, $jsonld); + + Logger::info('Fetched WebSite information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]); + return array_merge($siteinfo, $jsonldinfo); + } + + /** + * Improve the siteinfo with information from the provided JSON-LD Organization information + * + * @param array $siteinfo + * @param array $jsonld + * @return array siteinfo + */ + private static function parseJsonLdWebOrganization(array $siteinfo, array $jsonld) + { + $jsonldinfo = []; + + $content = JsonLD::fetchElement($jsonld, 'name'); + if (!empty($content)) { + $jsonldinfo['publisher'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'url'); + if (!empty($content)) { + $jsonldinfo['publisher_url'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'logo', 'url', '@type', 'ImageObject'); + if (!empty($content)) { + $jsonldinfo['publisher_image'] = trim($content); + } + + Logger::info('Fetched Organization information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]); + return array_merge($siteinfo, $jsonldinfo); + } + + /** + * Improve the siteinfo with information from the provided JSON-LD Person information + * + * @param array $siteinfo + * @param array $jsonld + * @return array siteinfo + */ + private static function parseJsonLdWebPerson(array $siteinfo, array $jsonld) + { + $jsonldinfo = []; + + $content = JsonLD::fetchElement($jsonld, 'name'); + if 
(!empty($content)) { + $jsonldinfo['author'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'description'); + if (!empty($content)) { + $jsonldinfo['author_description'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'url'); + if (!empty($content)) { + $jsonldinfo['author_url'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'image', 'url', '@type', 'ImageObject'); + if (!empty($content)) { + $jsonldinfo['author_image'] = trim($content); + } + + Logger::info('Fetched Person information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]); + return array_merge($siteinfo, $jsonldinfo); + } + /** * Convert tags from CSV to an array *