From 70bf75c3424215c4bcec10b208f74253fccfb3b7 Mon Sep 17 00:00:00 2001 From: Michael Date: Mon, 15 Mar 2021 22:02:21 +0000 Subject: [PATCH 01/11] Support for page JSON-LD based page information --- src/Content/OEmbed.php | 55 ++++--- src/Content/PageInfo.php | 17 +-- src/Util/ParseUrl.php | 313 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 355 insertions(+), 30 deletions(-) diff --git a/src/Content/OEmbed.php b/src/Content/OEmbed.php index bda3b503a..bb3f6290f 100644 --- a/src/Content/OEmbed.php +++ b/src/Content/OEmbed.php @@ -143,35 +143,54 @@ class OEmbed DI::cache()->set($cache_key, $json_string, $cache_ttl); } - if ($oembed->type == 'error') { + // Always embed the SSL version + if (!empty($oembed->html)) { + $oembed->html = str_replace(['http://www.youtube.com/', 'http://player.vimeo.com/'], ['https://www.youtube.com/', 'https://player.vimeo.com/'], $oembed->html); + } + + // Improve the OEmbed data with data from OpenGraph, Twitter cards and other sources + $data = ParseUrl::getSiteinfoCached($embedurl, true, false); + if (($oembed->type == 'error') && empty($data['title']) && empty($data['text'])) { return $oembed; } - // Always embed the SSL version - $oembed->html = str_replace(['http://www.youtube.com/', 'http://player.vimeo.com/'], ['https://www.youtube.com/', 'https://player.vimeo.com/'], $oembed->html); - - // If fetching information doesn't work, then improve via internal functions - if ($no_rich_type && ($oembed->type == 'rich')) { - $data = ParseUrl::getSiteinfoCached($embedurl, true, false); + if ($no_rich_type || ($oembed->type == 'error')) { + $oembed->html = ''; $oembed->type = $data['type']; if ($oembed->type == 'photo') { $oembed->url = $data['url']; } + } - if (isset($data['title'])) { - $oembed->title = $data['title']; - } + if (!empty($data['title']) && empty($oembed->title)) { + $oembed->title = $data['title']; + } - if (isset($data['text'])) { - $oembed->description = $data['text']; - } + if (!empty($data['text']) && empty($oembed->description)) { + $oembed->description = $data['text']; + } - if (!empty($data['images'])) { - $oembed->thumbnail_url = $data['images'][0]['src']; - $oembed->thumbnail_width = $data['images'][0]['width']; - $oembed->thumbnail_height = $data['images'][0]['height']; - } + if (!empty($data['publisher']) && empty($oembed->provider_name)) { + $oembed->provider_name = $data['publisher']; + } + + if (!empty($data['publisher_url']) && empty($oembed->provider_url)) { + $oembed->provider_url = $data['publisher_url']; + } + + if (!empty($data['author']) && empty($oembed->author_name)) { + $oembed->author_name = $data['author']; + } + + if (!empty($data['author_url']) && empty($oembed->author_url)) { + $oembed->author_url = $data['author_url']; + } + + if (!empty($data['images']) && empty($oembed->thumbnail_url)) { + $oembed->thumbnail_url = $data['images'][0]['src']; + $oembed->thumbnail_width = $data['images'][0]['width']; + $oembed->thumbnail_height = $data['images'][0]['height']; } Hook::callAll('oembed_fetch_url', $embedurl, $oembed); diff --git a/src/Content/PageInfo.php b/src/Content/PageInfo.php index 5396bc1bb..806a92a03 100644 --- a/src/Content/PageInfo.php +++ b/src/Content/PageInfo.php @@ -129,17 +129,12 @@ class PageInfo } // Escape some bad characters - $data['url'] = str_replace(['[', ']'], ['[', ']'], htmlentities($data['url'], ENT_QUOTES, 'UTF-8', false)); - $data['title'] = str_replace(['[', ']'], ['[', ']'], htmlentities($data['title'], ENT_QUOTES, 'UTF-8', false)); + $text = "[attachment"; - $text = "[attachment type='" . $data['type'] . "'"; - - if (!empty($data['url'])) { - $text .= " url='" . $data['url'] . "'"; - } - - if (!empty($data['title'])) { - $text .= " title='" . $data['title'] . "'"; + foreach (['type', 'url', 'title', 'alternative_title', 'publisher', 'publisher_url', 'publisher_image', 'author', 'author_url', 'author_image'] as $field) { + if (!empty($data[$field])) { + $text .= " " . $field . "='" . str_replace(['[', ']'], ['[', ']'], htmlentities($data[$field], ENT_QUOTES, 'UTF-8', false)) . "'"; + } } if (empty($data['text'])) { @@ -167,7 +162,7 @@ class PageInfo } } - $text .= ']' . $data['text'] . '[/attachment]'; + $text .= ']' . str_replace(['[', ']'], ['[', ']'], $data['text']) . '[/attachment]'; $hashtags = ''; if (!empty($data['keywords'])) { diff --git a/src/Util/ParseUrl.php b/src/Util/ParseUrl.php index da6c88abb..2661445e7 100644 --- a/src/Util/ParseUrl.php +++ b/src/Util/ParseUrl.php @@ -300,7 +300,6 @@ class ParseUrl @$doc->loadHTML($body); XML::deleteNode($doc, 'style'); - XML::deleteNode($doc, 'script'); XML::deleteNode($doc, 'option'); XML::deleteNode($doc, 'h1'); XML::deleteNode($doc, 'h2'); @@ -392,6 +391,9 @@ class ParseUrl case 'dc.description': $siteinfo['text'] = trim($meta_tag['content']); break; + case 'dc.creator': + $siteinfo['publisher'] = trim($meta_tag['content']); + break; case 'keywords': $keywords = explode(',', $meta_tag['content']); break; @@ -426,12 +428,46 @@ class ParseUrl case 'og:image': $siteinfo['image'] = $meta_tag['content']; break; + case 'og:image:url': + $siteinfo['image'] = $meta_tag['content']; + break; + case 'og:image:secure_url': + $siteinfo['image'] = $meta_tag['content']; + break; case 'og:title': $siteinfo['title'] = trim($meta_tag['content']); break; case 'og:description': $siteinfo['text'] = trim($meta_tag['content']); break; + case 'og:site_name': + $siteinfo['publisher'] = trim($meta_tag['content']); + break; + case 'twitter:description': + $siteinfo['text'] = trim($meta_tag['content']); + break; + case 'twitter:title': + $siteinfo['title'] = trim($meta_tag['content']); + break; + case 'twitter:image': + $siteinfo['image'] = $meta_tag['content']; + break; + } + } + } + + $list = $xpath->query("//script[@type='application/ld+json']"); + foreach ($list as $node) { + if (!empty($node->nodeValue)) { + $nodevalue = html_entity_decode($node->nodeValue, ENT_COMPAT, 'UTF-8'); + if ($jsonld = json_decode($nodevalue, true)) { + if (!empty($jsonld['@graph']) && is_array($jsonld['@graph'])) { + foreach ($jsonld['@graph'] as $part) { + $siteinfo = self::parseJsonLd($siteinfo, $part); + } + } else { + $siteinfo = self::parseJsonLd($siteinfo, $jsonld); + } } } } @@ -470,6 +506,281 @@ class ParseUrl return $siteinfo; } + /** + * Improve the siteinfo with information from the provided JSON-LD information + * @see https://jsonld.com/ + * + * @param array $siteinfo + * @param array $jsonld + * @return array siteinfo + */ + private static function parseJsonLd(array $siteinfo, array $jsonld) + { + $type = JsonLD::fetchElement($jsonld, '@type'); + + switch ($type) { + case 'Article': + case 'NewsArticle': + return self::parseJsonLdArticle($siteinfo, $jsonld); + case 'WebPage': + return self::parseJsonLdWebPage($siteinfo, $jsonld); + case 'WebSite': + return self::parseJsonLdWebSite($siteinfo, $jsonld); + case 'Organization': + return self::parseJsonLdWebOrganization($siteinfo, $jsonld); + case 'Person': + return self::parseJsonLdWebPerson($siteinfo, $jsonld); + case 'BreadcrumbList': + case 'Audio': /// @todo Can contain direct media links to audio - can be interesting in the future + case 'VideoObject': + case 'ImageObject': + case 'LiveBlogPosting': + case 'SocialMediaPosting': + // quit silently + return $siteinfo; + default: + Logger::info('Unsupported or unknown type', ['type' => $type, 'url' => $siteinfo['url']]); + return $siteinfo; + } + } + + /** + * Improve the siteinfo with information from the provided JSON-LD information concerning authors and publishers + * + * @param array $siteinfo + * @param array $jsonld + * @return array siteinfo + */ + private static function parseJsonLdAuthor(array $siteinfo, array $jsonld) + { + $jsonldinfo = []; + + if (is_array($jsonld['publisher'])) { + $content = JsonLD::fetchElement($jsonld, 'publisher', 'name', '@type', 'Organization'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['publisher'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'publisher', 'url', '@type', 'Organization'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['publisher_url'] = trim($content); + } + + $brand = JsonLD::fetchElement($jsonld, 'publisher', 'brand', '@type', 'Organization'); + if (!empty($brand)) { + $content = JsonLD::fetchElement($brand, 'name', '@type', 'brand'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['publisher'] = trim($content); + } + } + } + + if (is_array($jsonld['author'])) { + $content = JsonLD::fetchElement($jsonld, 'author', 'name', '@type', 'Organization'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['publisher'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'author', 'url', '@type', 'Organization'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['publisher_url'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'author', 'name', '@type', 'Person'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['author'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'author', 'url', '@type', 'Person'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['author_url'] = trim($content); + } + } + + Logger::info('Fetched author information', ['fetched' => $jsonldinfo]); + + return array_merge($siteinfo, $jsonldinfo); + } + + /** + * Improve the siteinfo with information from the provided JSON-LD Article information + * + * @param array $siteinfo + * @param array $jsonld + * @return array siteinfo + */ + private static function parseJsonLdArticle(array $siteinfo, array $jsonld) + { + $jsonldinfo = []; + + $content = JsonLD::fetchElement($jsonld, 'headline'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['title'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'alternativeHeadline'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['alternative_title'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'description'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['text'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'image', 'url', '@type', 'ImageObject'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['image'] = trim($content); + } + +/// @todo Check for the correct separator, also check for dpublicates before adding +// $content = JsonLD::fetchElement($jsonld, 'keywords'); +// if (!empty($content) && is_string($content)) { +// $jsonldinfo['keywords'] = trim($content); +// } + + $jsonldinfo = self::parseJsonLdAuthor($jsonldinfo, $jsonld); + + Logger::info('Fetched article information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]); + + return array_merge($siteinfo, $jsonldinfo); + } + + /** + * Improve the siteinfo with information from the provided JSON-LD WebPage information + * + * @param array $siteinfo + * @param array $jsonld + * @return array siteinfo + */ + private static function parseJsonLdWebPage(array $siteinfo, array $jsonld) + { + $jsonldinfo = []; + + $content = JsonLD::fetchElement($jsonld, 'name'); + if (!empty($content)) { + $jsonldinfo['title'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'description'); + if (!empty($content)) { + $jsonldinfo['text'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'thumbnailUrl'); + if (!empty($content)) { + $jsonldinfo['image'] = trim($content); + } + + $jsonldinfo = self::parseJsonLdAuthor($jsonldinfo, $jsonld); + + Logger::info('Fetched webpage information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]); + + return array_merge($siteinfo, $jsonldinfo); + } + + /** + * Improve the siteinfo with information from the provided JSON-LD WebSite information + * + * @param array $siteinfo + * @param array $jsonld + * @return array siteinfo + */ + private static function parseJsonLdWebSite(array $siteinfo, array $jsonld) + { + $jsonldinfo = []; + + $content = JsonLD::fetchElement($jsonld, 'name'); + if (!empty($content)) { + $jsonldinfo['publisher'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'description'); + if (!empty($content)) { + $jsonldinfo['publisher_description'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'url'); + if (!empty($content)) { + $jsonldinfo['publisher_url'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'thumbnailUrl'); + if (!empty($content)) { + $jsonldinfo['image'] = trim($content); + } + + $jsonldinfo = self::parseJsonLdAuthor($jsonldinfo, $jsonld); + + Logger::info('Fetched WebSite information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]); + return array_merge($siteinfo, $jsonldinfo); + } + + /** + * Improve the siteinfo with information from the provided JSON-LD Organization information + * + * @param array $siteinfo + * @param array $jsonld + * @return array siteinfo + */ + private static function parseJsonLdWebOrganization(array $siteinfo, array $jsonld) + { + $jsonldinfo = []; + + $content = JsonLD::fetchElement($jsonld, 'name'); + if (!empty($content)) { + $jsonldinfo['publisher'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'url'); + if (!empty($content)) { + $jsonldinfo['publisher_url'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'logo', 'url', '@type', 'ImageObject'); + if (!empty($content)) { + $jsonldinfo['publisher_image'] = trim($content); + } + + Logger::info('Fetched Organization information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]); + return array_merge($siteinfo, $jsonldinfo); + } + + /** + * Improve the siteinfo with information from the provided JSON-LD Person information + * + * @param array $siteinfo + * @param array $jsonld + * @return array siteinfo + */ + private static function parseJsonLdWebPerson(array $siteinfo, array $jsonld) + { + $jsonldinfo = []; + + $content = JsonLD::fetchElement($jsonld, 'name'); + if (!empty($content)) { + $jsonldinfo['author'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'description'); + if (!empty($content)) { + $jsonldinfo['author_description'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'url'); + if (!empty($content)) { + $jsonldinfo['author_url'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'image', 'url', '@type', 'ImageObject'); + if (!empty($content)) { + $jsonldinfo['author_image'] = trim($content); + } + + Logger::info('Fetched Person information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]); + return array_merge($siteinfo, $jsonldinfo); + } + /** * Convert tags from CSV to an array * From 0a3d50a270620c15c901dcfc96be808d4a7c1278 Mon Sep 17 00:00:00 2001 From: Michael Date: Tue, 16 Mar 2021 06:37:43 +0000 Subject: [PATCH 02/11] Adjusted field names --- mod/item.php | 18 ++++++++++++------ src/Content/OEmbed.php | 18 +++++++++--------- src/Content/PageInfo.php | 2 +- src/Util/ParseUrl.php | 26 +++++++++++++------------- 4 files changed, 35 insertions(+), 29 deletions(-) diff --git a/mod/item.php b/mod/item.php index 7c11d7311..467d538f2 100644 --- a/mod/item.php +++ b/mod/item.php @@ -56,6 +56,7 @@ use Friendica\Protocol\Activity; use Friendica\Protocol\Diaspora; use Friendica\Security\Security; use Friendica\Util\DateTimeFormat; +use Friendica\Util\ParseUrl; use Friendica\Worker\Delivery; function item_post(App $a) { @@ -217,12 +218,15 @@ function item_post(App $a) { $attachment_img_width = $_REQUEST['attachment_img_width'] ?? 0; $attachment_img_height = $_REQUEST['attachment_img_height'] ?? 0; - $attachment = [ - 'type' => $attachment_type, - 'title' => $attachment_title, - 'text' => $attachment_text, - 'url' => $attachment_url, - ]; + + // Fetch the basic attachment data + $attachment = ParseUrl::getSiteinfoCached($attachment_url); + + // Overwrite the basic data with possible changes from the frontend + $attachment['type'] = $attachment_type; + $attachment['title'] = $attachment_title; + $attachment['text'] = $attachment_text; + $attachment['url'] = $attachment_url; if (!empty($attachment_img_src)) { $attachment['images'] = [ @@ -232,6 +236,8 @@ function item_post(App $a) { 'height' => $attachment_img_height ] ]; + } else { + unset($attachment['images']); } $att_bbcode = "\n" . PageInfo::getFooterFromData($attachment); diff --git a/src/Content/OEmbed.php b/src/Content/OEmbed.php index bb3f6290f..4e26e45f9 100644 --- a/src/Content/OEmbed.php +++ b/src/Content/OEmbed.php @@ -163,31 +163,31 @@ class OEmbed } } - if (!empty($data['title']) && empty($oembed->title)) { + if (!empty($data['title'])) { $oembed->title = $data['title']; } - if (!empty($data['text']) && empty($oembed->description)) { + if (!empty($data['text'])) { $oembed->description = $data['text']; } - if (!empty($data['publisher']) && empty($oembed->provider_name)) { - $oembed->provider_name = $data['publisher']; + if (!empty($data['publisher_name'])) { + $oembed->provider_name = $data['publisher_name']; } - if (!empty($data['publisher_url']) && empty($oembed->provider_url)) { + if (!empty($data['publisher_url'])) { $oembed->provider_url = $data['publisher_url']; } - if (!empty($data['author']) && empty($oembed->author_name)) { - $oembed->author_name = $data['author']; + if (!empty($data['author_name'])) { + $oembed->author_name = $data['author_name']; } - if (!empty($data['author_url']) && empty($oembed->author_url)) { + if (!empty($data['author_url'])) { $oembed->author_url = $data['author_url']; } - if (!empty($data['images']) && empty($oembed->thumbnail_url)) { + if (!empty($data['images'])) { $oembed->thumbnail_url = $data['images'][0]['src']; $oembed->thumbnail_width = $data['images'][0]['width']; $oembed->thumbnail_height = $data['images'][0]['height']; diff --git a/src/Content/PageInfo.php b/src/Content/PageInfo.php index 806a92a03..776dc15df 100644 --- a/src/Content/PageInfo.php +++ b/src/Content/PageInfo.php @@ -131,7 +131,7 @@ class PageInfo // Escape some bad characters $text = "[attachment"; - foreach (['type', 'url', 'title', 'alternative_title', 'publisher', 'publisher_url', 'publisher_image', 'author', 'author_url', 'author_image'] as $field) { + foreach (['type', 'url', 'title', 'alternative_title', 'publisher_name', 'publisher_url', 'publisher_img', 'author_name', 'author_url', 'author_img'] as $field) { if (!empty($data[$field])) { $text .= " " . $field . "='" . str_replace(['[', ']'], ['[', ']'], htmlentities($data[$field], ENT_QUOTES, 'UTF-8', false)) . "'"; } diff --git a/src/Util/ParseUrl.php b/src/Util/ParseUrl.php index 2661445e7..8bde05f23 100644 --- a/src/Util/ParseUrl.php +++ b/src/Util/ParseUrl.php @@ -392,7 +392,7 @@ class ParseUrl $siteinfo['text'] = trim($meta_tag['content']); break; case 'dc.creator': - $siteinfo['publisher'] = trim($meta_tag['content']); + $siteinfo['publisher_name'] = trim($meta_tag['content']); break; case 'keywords': $keywords = explode(',', $meta_tag['content']); @@ -441,7 +441,7 @@ class ParseUrl $siteinfo['text'] = trim($meta_tag['content']); break; case 'og:site_name': - $siteinfo['publisher'] = trim($meta_tag['content']); + $siteinfo['publisher_name'] = trim($meta_tag['content']); break; case 'twitter:description': $siteinfo['text'] = trim($meta_tag['content']); @@ -555,10 +555,10 @@ class ParseUrl { $jsonldinfo = []; - if (is_array($jsonld['publisher'])) { + if (!empty($jsonld['publisher']) && is_array($jsonld['publisher'])) { $content = JsonLD::fetchElement($jsonld, 'publisher', 'name', '@type', 'Organization'); if (!empty($content) && is_string($content)) { - $jsonldinfo['publisher'] = trim($content); + $jsonldinfo['publisher_name'] = trim($content); } $content = JsonLD::fetchElement($jsonld, 'publisher', 'url', '@type', 'Organization'); @@ -570,15 +570,15 @@ class ParseUrl if (!empty($brand)) { $content = JsonLD::fetchElement($brand, 'name', '@type', 'brand'); if (!empty($content) && is_string($content)) { - $jsonldinfo['publisher'] = trim($content); + $jsonldinfo['publisher_name'] = trim($content); } } } - if (is_array($jsonld['author'])) { + if (!empty($jsonld['author']) && is_array($jsonld['author'])) { $content = JsonLD::fetchElement($jsonld, 'author', 'name', '@type', 'Organization'); if (!empty($content) && is_string($content)) { - $jsonldinfo['publisher'] = trim($content); + $jsonldinfo['publisher_name'] = trim($content); } $content = JsonLD::fetchElement($jsonld, 'author', 'url', '@type', 'Organization'); @@ -588,7 +588,7 @@ class ParseUrl $content = JsonLD::fetchElement($jsonld, 'author', 'name', '@type', 'Person'); if (!empty($content) && is_string($content)) { - $jsonldinfo['author'] = trim($content); + $jsonldinfo['author_name'] = trim($content); } $content = JsonLD::fetchElement($jsonld, 'author', 'url', '@type', 'Person'); @@ -692,7 +692,7 @@ class ParseUrl $content = JsonLD::fetchElement($jsonld, 'name'); if (!empty($content)) { - $jsonldinfo['publisher'] = trim($content); + $jsonldinfo['publisher_name'] = trim($content); } $content = JsonLD::fetchElement($jsonld, 'description'); @@ -729,7 +729,7 @@ class ParseUrl $content = JsonLD::fetchElement($jsonld, 'name'); if (!empty($content)) { - $jsonldinfo['publisher'] = trim($content); + $jsonldinfo['publisher_name'] = trim($content); } $content = JsonLD::fetchElement($jsonld, 'url'); @@ -739,7 +739,7 @@ class ParseUrl $content = JsonLD::fetchElement($jsonld, 'logo', 'url', '@type', 'ImageObject'); if (!empty($content)) { - $jsonldinfo['publisher_image'] = trim($content); + $jsonldinfo['publisher_img'] = trim($content); } Logger::info('Fetched Organization information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]); @@ -759,7 +759,7 @@ class ParseUrl $content = JsonLD::fetchElement($jsonld, 'name'); if (!empty($content)) { - $jsonldinfo['author'] = trim($content); + $jsonldinfo['author_name'] = trim($content); } $content = JsonLD::fetchElement($jsonld, 'description'); @@ -774,7 +774,7 @@ class ParseUrl $content = JsonLD::fetchElement($jsonld, 'image', 'url', '@type', 'ImageObject'); if (!empty($content)) { - $jsonldinfo['author_image'] = trim($content); + $jsonldinfo['author_img'] = trim($content); } Logger::info('Fetched Person information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]); From d498d1520027f005a278f19daad6c0486f74abe4 Mon Sep 17 00:00:00 2001 From: Michael Date: Tue, 16 Mar 2021 07:04:16 +0000 Subject: [PATCH 03/11] Avoid double fetches --- src/Content/OEmbed.php | 72 ++++++++++++++++++++++-------------------- src/Util/ParseUrl.php | 20 +++++++++--- 2 files changed, 54 insertions(+), 38 deletions(-) diff --git a/src/Content/OEmbed.php b/src/Content/OEmbed.php index 4e26e45f9..bf3c113b7 100644 --- a/src/Content/OEmbed.php +++ b/src/Content/OEmbed.php @@ -62,11 +62,12 @@ class OEmbed * * @param string $embedurl The URL from which the data should be fetched. * @param bool $no_rich_type If set to true rich type content won't be fetched. + * @param bool $use_parseurl Use the "ParseUrl" functionality to add additional data * * @return \Friendica\Object\OEmbed * @throws \Friendica\Network\HTTPException\InternalServerErrorException */ - public static function fetchURL($embedurl, $no_rich_type = false) + public static function fetchURL($embedurl, bool $no_rich_type = false, bool $use_parseurl = true) { $embedurl = trim($embedurl, '\'"'); @@ -149,48 +150,51 @@ class OEmbed } // Improve the OEmbed data with data from OpenGraph, Twitter cards and other sources - $data = ParseUrl::getSiteinfoCached($embedurl, true, false); - if (($oembed->type == 'error') && empty($data['title']) && empty($data['text'])) { - return $oembed; - } + if ($use_parseurl) { + $data = ParseUrl::getSiteinfoCached($embedurl, true, false); - if ($no_rich_type || ($oembed->type == 'error')) { - $oembed->html = ''; - $oembed->type = $data['type']; - - if ($oembed->type == 'photo') { - $oembed->url = $data['url']; + if (($oembed->type == 'error') && empty($data['title']) && empty($data['text'])) { + return $oembed; } - } - if (!empty($data['title'])) { - $oembed->title = $data['title']; - } + if ($no_rich_type || ($oembed->type == 'error')) { + $oembed->html = ''; + $oembed->type = $data['type']; - if (!empty($data['text'])) { - $oembed->description = $data['text']; - } + if ($oembed->type == 'photo') { + $oembed->url = $data['url']; + } + } - if (!empty($data['publisher_name'])) { - $oembed->provider_name = $data['publisher_name']; - } + if (!empty($data['title'])) { + $oembed->title = $data['title']; + } - if (!empty($data['publisher_url'])) { - $oembed->provider_url = $data['publisher_url']; - } + if (!empty($data['text'])) { + $oembed->description = $data['text']; + } - if (!empty($data['author_name'])) { - $oembed->author_name = $data['author_name']; - } + if (!empty($data['publisher_name'])) { + $oembed->provider_name = $data['publisher_name']; + } - if (!empty($data['author_url'])) { - $oembed->author_url = $data['author_url']; - } + if (!empty($data['publisher_url'])) { + $oembed->provider_url = $data['publisher_url']; + } - if (!empty($data['images'])) { - $oembed->thumbnail_url = $data['images'][0]['src']; - $oembed->thumbnail_width = $data['images'][0]['width']; - $oembed->thumbnail_height = $data['images'][0]['height']; + if (!empty($data['author_name'])) { + $oembed->author_name = $data['author_name']; + } + + if (!empty($data['author_url'])) { + $oembed->author_url = $data['author_url']; + } + + if (!empty($data['images'])) { + $oembed->thumbnail_url = $data['images'][0]['src']; + $oembed->thumbnail_width = $data['images'][0]['width']; + $oembed->thumbnail_height = $data['images'][0]['height']; + } } Hook::callAll('oembed_fetch_url', $embedurl, $oembed); diff --git a/src/Util/ParseUrl.php b/src/Util/ParseUrl.php index 8bde05f23..cca158c26 100644 --- a/src/Util/ParseUrl.php +++ b/src/Util/ParseUrl.php @@ -241,7 +241,7 @@ class ParseUrl $body = $curlResult->getBody(); if ($do_oembed) { - $oembed_data = OEmbed::fetchURL($url); + $oembed_data = OEmbed::fetchURL($url, false, false); if (!empty($oembed_data->type)) { if (!in_array($oembed_data->type, ['error', 'rich', 'image', 'video', 'audio', ''])) { @@ -250,13 +250,25 @@ class ParseUrl // See https://github.com/friendica/friendica/pull/5763#discussion_r217913178 if ($siteinfo['type'] != 'photo') { - if (isset($oembed_data->title)) { + if (!empty($oembed_data->title)) { $siteinfo['title'] = trim($oembed_data->title); } - if (isset($oembed_data->description)) { + if (!empty($oembed_data->description)) { $siteinfo['text'] = trim($oembed_data->description); } - if (isset($oembed_data->thumbnail_url)) { + if (!empty($oembed_data->author_name)) { + $siteinfo['author_name'] = trim($oembed_data->author_name); + } + if (!empty($oembed_data->author_url)) { + $siteinfo['author_url'] = trim($oembed_data->author_url); + } + if (!empty($oembed_data->provider_name)) { + $siteinfo['publisher_name'] = trim($oembed_data->provider_name); + } + if (!empty($oembed_data->provider_url)) { + $siteinfo['publisher_url'] = trim($oembed_data->provider_url); + } + if (!empty($oembed_data->thumbnail_url)) { $siteinfo['image'] = $oembed_data->thumbnail_url; } } From 08771d96c2c5ea27e004991ffd923f9a1f326e87 Mon Sep 17 00:00:00 2001 From: Michael Date: Tue, 16 Mar 2021 07:15:20 +0000 Subject: [PATCH 04/11] Remove unused parameter --- src/Content/OEmbed.php | 2 +- src/Content/PageInfo.php | 2 +- src/Content/Text/BBCode.php | 6 +++--- src/Protocol/Feed.php | 2 +- src/Util/ParseUrl.php | 19 +++++++------------ 5 files changed, 13 insertions(+), 18 deletions(-) diff --git a/src/Content/OEmbed.php b/src/Content/OEmbed.php index bf3c113b7..45e352c15 100644 --- a/src/Content/OEmbed.php +++ b/src/Content/OEmbed.php @@ -151,7 +151,7 @@ class OEmbed // Improve the OEmbed data with data from OpenGraph, Twitter cards and other sources if ($use_parseurl) { - $data = ParseUrl::getSiteinfoCached($embedurl, true, false); + $data = ParseUrl::getSiteinfoCached($embedurl, false); if (($oembed->type == 'error') && empty($data['title']) && empty($data['text'])) { return $oembed; diff --git a/src/Content/PageInfo.php b/src/Content/PageInfo.php index 776dc15df..3cfab3769 100644 --- a/src/Content/PageInfo.php +++ b/src/Content/PageInfo.php @@ -187,7 +187,7 @@ class PageInfo */ public static function queryUrl(string $url, string $photo = '', bool $keywords = false, string $keyword_denylist = '') { - $data = ParseUrl::getSiteinfoCached($url, true); + $data = ParseUrl::getSiteinfoCached($url); if ($photo != '') { $data['images'][0]['src'] = $photo; diff --git a/src/Content/Text/BBCode.php b/src/Content/Text/BBCode.php index 6101f5479..eaa5e1640 100644 --- a/src/Content/Text/BBCode.php +++ b/src/Content/Text/BBCode.php @@ -322,14 +322,14 @@ class BBCode $data = ['url' => $url, 'type' => 'photo']; } else { // Checking, if the link goes to a picture - $data = ParseUrl::getSiteinfoCached($pictures[0][1], true); + $data = ParseUrl::getSiteinfoCached($pictures[0][1]); } // Workaround: // Sometimes photo posts to the own album are not detected at the start. // So we seem to cannot use the cache for these cases. That's strange. if (($data['type'] != 'photo') && strstr($pictures[0][1], "/photos/")) { - $data = ParseUrl::getSiteinfo($pictures[0][1], true); + $data = ParseUrl::getSiteinfo($pictures[0][1]); } if ($data['type'] == 'photo') { @@ -416,7 +416,7 @@ class BBCode $post['text'] = trim($body); } } elseif (isset($post['url']) && ($post['type'] == 'video')) { - $data = ParseUrl::getSiteinfoCached($post['url'], true); + $data = ParseUrl::getSiteinfoCached($post['url']); if (isset($data['images'][0])) { $post['image'] = $data['images'][0]['src']; diff --git a/src/Protocol/Feed.php b/src/Protocol/Feed.php index 196d7c04f..afa0d9a23 100644 --- a/src/Protocol/Feed.php +++ b/src/Protocol/Feed.php @@ -484,7 +484,7 @@ class Feed $item["body"] = trim($item["title"]); } - $data = ParseUrl::getSiteinfoCached($item['plink'], true); + $data = ParseUrl::getSiteinfoCached($item['plink']); if (!empty($data['text']) && !empty($data['title']) && (mb_strlen($item['body']) < mb_strlen($data['text']))) { // When the fetched page info text is longer than the body, we do try to enhance the body if (!empty($item['body']) && (strpos($data['title'], $item['body']) === false) && (strpos($data['text'], $item['body']) === false)) { diff --git a/src/Util/ParseUrl.php b/src/Util/ParseUrl.php index cca158c26..e57be2412 100644 --- a/src/Util/ParseUrl.php +++ b/src/Util/ParseUrl.php @@ -75,8 +75,6 @@ class ParseUrl * Search for chached embeddable data of an url otherwise fetch it * * @param string $url The url of the page which should be scraped - * @param bool $no_guessing If true the parse doens't search for - * preview pictures * @param bool $do_oembed The false option is used by the function fetch_oembed() * to avoid endless loops * @@ -85,7 +83,7 @@ class ParseUrl * string 'type' => Content type * string 'title' => (optional) The title of the content * string 'text' => (optional) The description for the content - * string 'image' => (optional) A preview image of the content (only available if $no_geuessing = false) + * string 'image' => (optional) A preview image of the content * array 'images' => (optional) Array of preview pictures * string 'keywords' => (optional) The tags which belong to the content * @@ -93,7 +91,7 @@ class ParseUrl * @see ParseUrl::getSiteinfo() for more information about scraping * embeddable content */ - public static function getSiteinfoCached($url, $no_guessing = false, $do_oembed = true): array + public static function getSiteinfoCached($url, $do_oembed = true): array { if (empty($url)) { return [ @@ -105,14 +103,14 @@ class ParseUrl $urlHash = hash('sha256', $url); $parsed_url = DBA::selectFirst('parsed_url', ['content'], - ['url_hash' => $urlHash, 'guessing' => !$no_guessing, 'oembed' => $do_oembed] + ['url_hash' => $urlHash, 'oembed' => $do_oembed] ); if (!empty($parsed_url['content'])) { $data = unserialize($parsed_url['content']); return $data; } - $data = self::getSiteinfo($url, $no_guessing, $do_oembed); + $data = self::getSiteinfo($url, $do_oembed); $expires = $data['expires']; @@ -122,7 +120,6 @@ class ParseUrl 'parsed_url', [ 'url_hash' => $urlHash, - 'guessing' => !$no_guessing, 'oembed' => $do_oembed, 'url' => $url, 'content' => serialize($data), @@ -146,8 +143,6 @@ class ParseUrl * \ * * @param string $url The url of the page which should be scraped - * @param bool $no_guessing If true the parse doens't search for - * preview pictures * @param bool $do_oembed The false option is used by the function fetch_oembed() * to avoid endless loops * @param int $count Internal counter to avoid endless loops @@ -157,7 +152,7 @@ class ParseUrl * string 'type' => Content type (error, link, photo, image, audio, video) * string 'title' => (optional) The title of the content * string 'text' => (optional) The description for the content - * string 'image' => (optional) A preview image of the content (only available if $no_guessing = false) + * string 'image' => (optional) A preview image of the content * array 'images' => (optional) Array of preview pictures * string 'keywords' => (optional) The tags which belong to the content * @@ -175,7 +170,7 @@ class ParseUrl * * @endverbatim */ - public static function getSiteinfo($url, $no_guessing = false, $do_oembed = true, $count = 1) + public static function getSiteinfo($url, $do_oembed = true, $count = 1) { if (empty($url)) { return [ @@ -343,7 +338,7 @@ class ParseUrl } } if ($content != '') { - $siteinfo = self::getSiteinfo($content, $no_guessing, $do_oembed, ++$count); + $siteinfo = self::getSiteinfo($content, $do_oembed, ++$count); return $siteinfo; } } From 003bf69d8814a927bde40aa8568d3cc4e47ab0b7 Mon Sep 17 00:00:00 2001 From: Michael Date: Tue, 16 Mar 2021 11:32:56 +0000 Subject: [PATCH 05/11] Changed assigning of author and publisher --- src/Util/ParseUrl.php | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/Util/ParseUrl.php b/src/Util/ParseUrl.php index e57be2412..b9d4b5e45 100644 --- a/src/Util/ParseUrl.php +++ b/src/Util/ParseUrl.php @@ -563,6 +563,16 @@ class ParseUrl $jsonldinfo = []; if (!empty($jsonld['publisher']) && is_array($jsonld['publisher'])) { + $content = JsonLD::fetchElement($jsonld, 'publisher', 'name', '@type', 'Person'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['publisher_name'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'publisher', 'url', '@type', 'Person'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['publisher_url'] = trim($content); + } + $content = JsonLD::fetchElement($jsonld, 'publisher', 'name', '@type', 'Organization'); if (!empty($content) && is_string($content)) { $jsonldinfo['publisher_name'] = trim($content); @@ -574,7 +584,7 @@ class ParseUrl } $brand = JsonLD::fetchElement($jsonld, 'publisher', 'brand', '@type', 'Organization'); - if (!empty($brand)) { + if (!empty($brand) && is_array($brand)) { $content = JsonLD::fetchElement($brand, 'name', '@type', 'brand'); if (!empty($content) && is_string($content)) { $jsonldinfo['publisher_name'] = trim($content); @@ -585,12 +595,12 @@ class ParseUrl if (!empty($jsonld['author']) && is_array($jsonld['author'])) { $content = JsonLD::fetchElement($jsonld, 'author', 'name', '@type', 'Organization'); if (!empty($content) && is_string($content)) { - $jsonldinfo['publisher_name'] = trim($content); + $jsonldinfo['author_name'] = trim($content); } $content = JsonLD::fetchElement($jsonld, 'author', 'url', '@type', 'Organization'); if (!empty($content) && is_string($content)) { - $jsonldinfo['publisher_url'] = trim($content); + $jsonldinfo['author_url'] = trim($content); } $content = JsonLD::fetchElement($jsonld, 'author', 'name', '@type', 'Person'); From df566eea78ab120e333ba5ef699699bae145cc4d Mon Sep 17 00:00:00 2001 From: Michael Date: Tue, 16 Mar 2021 22:57:24 +0000 Subject: [PATCH 06/11] Added more types --- src/Util/ParseUrl.php | 173 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 140 insertions(+), 33 deletions(-) diff --git a/src/Util/ParseUrl.php b/src/Util/ParseUrl.php index b9d4b5e45..6b59acd5e 100644 --- a/src/Util/ParseUrl.php +++ b/src/Util/ParseUrl.php @@ -468,13 +468,7 @@ class ParseUrl if (!empty($node->nodeValue)) { $nodevalue = html_entity_decode($node->nodeValue, ENT_COMPAT, 'UTF-8'); if ($jsonld = json_decode($nodevalue, true)) { - if (!empty($jsonld['@graph']) && is_array($jsonld['@graph'])) { - foreach ($jsonld['@graph'] as $part) { - $siteinfo = self::parseJsonLd($siteinfo, $part); - } - } else { - $siteinfo = self::parseJsonLd($siteinfo, $jsonld); - } + $siteinfo = self::parseParts($siteinfo, $jsonld); } } } @@ -513,6 +507,39 @@ class ParseUrl return $siteinfo; } + /** + * Parse the Json-Ld parts + * + * @param array $siteinfo + * @param array $jsonld + * @return array siteinfo + */ + private static function parseParts(array $siteinfo, array $jsonld) + { + if (!empty($jsonld['@graph']) && is_array($jsonld['@graph'])) { + foreach ($jsonld['@graph'] as $part) { + $siteinfo = self::parseJsonLd($siteinfo, $part); + } + } elseif (!empty($jsonld['@type'])) { + $siteinfo = self::parseJsonLd($siteinfo, $jsonld); + } elseif (!empty($jsonld)) { + $keys = array_keys($jsonld); + $numeric_keys = true; + foreach ($keys as $key) { + if (!is_int($key)) { + $numeric_keys = false; + } + } + if ($numeric_keys) { + foreach ($jsonld as $part) { + $siteinfo = self::parseParts($siteinfo, $part); + } + } + } + + return $siteinfo; + } + /** * Improve the siteinfo with information from the provided JSON-LD information * @see https://jsonld.com/ @@ -524,26 +551,52 @@ class ParseUrl private static function parseJsonLd(array $siteinfo, array $jsonld) { $type = JsonLD::fetchElement($jsonld, '@type'); + if (empty($type)) { + Logger::info('Empty type', ['url' => $siteinfo['url']]); + return $siteinfo; + } switch ($type) { case 'Article': case 'NewsArticle': + case 'ScholarlyArticle': + case 'ReportageNewsArticle': return self::parseJsonLdArticle($siteinfo, $jsonld); case 'WebPage': + case 'RadioEpisode': + case 'Event': return self::parseJsonLdWebPage($siteinfo, $jsonld); case 'WebSite': return self::parseJsonLdWebSite($siteinfo, $jsonld); case 'Organization': - return self::parseJsonLdWebOrganization($siteinfo, $jsonld); + case 'NewsMediaOrganization': + case 'LocalBusiness': + return self::parseJsonLdWebOrganization($siteinfo, $jsonld); case 'Person': return self::parseJsonLdWebPerson($siteinfo, $jsonld); - case 'BreadcrumbList': - case 'Audio': /// @todo Can contain direct media links to audio - can be interesting in the future + case 'Audio': + case 'AudioObject': + return self::parseJsonLdAudio($siteinfo, $jsonld); + case 'VideoObject': case 'ImageObject': + + case 'WPHeader': // Temp + case 'WPSideBar': // Temp + case 'WPFooter': // Temp + case 'LiveBlogPosting': case 'SocialMediaPosting': - // quit silently + case 'BreadcrumbList': + case 'ItemList': + case 'LegalService': + case 'MusicGroup': + case 'Blog': + case 'BlogPosting': + case 'Dataset': + case 'CollectionPage': + case 'ImageGallery': + // quit silently return $siteinfo; default: Logger::info('Unsupported or unknown type', ['type' => $type, 'url' => $siteinfo['url']]); @@ -563,22 +616,17 @@ class ParseUrl $jsonldinfo = []; if (!empty($jsonld['publisher']) && is_array($jsonld['publisher'])) { - $content = JsonLD::fetchElement($jsonld, 'publisher', 'name', '@type', 'Person'); + $content = JsonLD::fetchElement($jsonld, 'publisher', 'name'); if (!empty($content) && is_string($content)) { $jsonldinfo['publisher_name'] = trim($content); } - $content = JsonLD::fetchElement($jsonld, 'publisher', 'url', '@type', 'Person'); + $content = JsonLD::fetchElement($jsonld, 'publisher', 'sameAs'); if (!empty($content) && is_string($content)) { $jsonldinfo['publisher_url'] = trim($content); } - $content = JsonLD::fetchElement($jsonld, 'publisher', 'name', '@type', 'Organization'); - if (!empty($content) && is_string($content)) { - $jsonldinfo['publisher_name'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'publisher', 'url', '@type', 'Organization'); + $content = JsonLD::fetchElement($jsonld, 'publisher', 'url'); if (!empty($content) && is_string($content)) { $jsonldinfo['publisher_url'] = trim($content); } @@ -590,28 +638,27 @@ class ParseUrl $jsonldinfo['publisher_name'] = trim($content); } } + } elseif (!empty($jsonld['publisher']) && is_string($jsonld['publisher'])) { + $jsonldinfo['publisher_name'] = trim($jsonld['publisher']); } if (!empty($jsonld['author']) && is_array($jsonld['author'])) { - $content = JsonLD::fetchElement($jsonld, 'author', 'name', '@type', 'Organization'); + $content = JsonLD::fetchElement($jsonld, 'author', 'name'); if (!empty($content) && is_string($content)) { $jsonldinfo['author_name'] = trim($content); } - $content = JsonLD::fetchElement($jsonld, 'author', 'url', '@type', 'Organization'); + $content = JsonLD::fetchElement($jsonld, 'author', 'sameAs'); if (!empty($content) && is_string($content)) { $jsonldinfo['author_url'] = trim($content); } - $content = JsonLD::fetchElement($jsonld, 'author', 'name', '@type', 'Person'); - if (!empty($content) && is_string($content)) { - $jsonldinfo['author_name'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'author', 'url', '@type', 'Person'); + $content = JsonLD::fetchElement($jsonld, 'author', 'url'); if (!empty($content) && is_string($content)) { $jsonldinfo['author_url'] = trim($content); } + } elseif (!empty($jsonld['author']) && is_string($jsonld['author'])) { + $jsonldinfo['author_name'] = trim($jsonld['author']); } Logger::info('Fetched author information', ['fetched' => $jsonldinfo]); @@ -636,7 +683,7 @@ class ParseUrl } $content = JsonLD::fetchElement($jsonld, 'alternativeHeadline'); - if (!empty($content) && is_string($content)) { + if (!empty($content) && is_string($content) && (($jsonldinfo['title'] ?? '') != trim($content))) { $jsonldinfo['alternative_title'] = trim($content); } @@ -645,16 +692,31 @@ class ParseUrl $jsonldinfo['text'] = trim($content); } + $content = JsonLD::fetchElement($jsonld, 'thumbnailUrl'); + if (!empty($content)) { + $jsonldinfo['image'] = trim($content); + } + $content = JsonLD::fetchElement($jsonld, 'image', 'url', '@type', 'ImageObject'); if (!empty($content) && is_string($content)) { $jsonldinfo['image'] = trim($content); } -/// @todo Check for the correct separator, also check for dpublicates before adding -// $content = JsonLD::fetchElement($jsonld, 'keywords'); -// if (!empty($content) && is_string($content)) { -// $jsonldinfo['keywords'] = trim($content); -// } + if (!empty($jsonld['keywords']) && !is_array($jsonld['keywords'])) { + $content = JsonLD::fetchElement($jsonld, 'keywords'); + if (!empty($content)) { + $siteinfo['keywords'] = []; + $keywords = explode(',', $content); + foreach ($keywords as $keyword) { + $siteinfo['keywords'][] = trim($keyword); + } + } + } else { + $content = JsonLD::fetchElementArray($jsonld, 'keywords'); + if (!empty($content) && is_array($content)) { + $jsonldinfo['keywords'] = $content; + } + } $jsonldinfo = self::parseJsonLdAuthor($jsonldinfo, $jsonld); @@ -684,6 +746,11 @@ class ParseUrl $jsonldinfo['text'] = trim($content); } + $content = JsonLD::fetchElement($jsonld, 'image'); + if (!empty($content)) { + $jsonldinfo['image'] = trim($content); + } + $content = JsonLD::fetchElement($jsonld, 'thumbnailUrl'); if (!empty($content)) { $jsonldinfo['image'] = trim($content); @@ -749,6 +816,11 @@ class ParseUrl $jsonldinfo['publisher_name'] = trim($content); } + $content = JsonLD::fetchElement($jsonld, 'description'); + if (!empty($content)) { + $jsonldinfo['publisher_description'] = trim($content); + } + $content = JsonLD::fetchElement($jsonld, 'url'); if (!empty($content)) { $jsonldinfo['publisher_url'] = trim($content); @@ -798,6 +870,41 @@ class ParseUrl return array_merge($siteinfo, $jsonldinfo); } + /** + * Improve the siteinfo with information from the provided JSON-LD Audio information + * + * @param array $siteinfo + * @param array $jsonld + * @return array siteinfo + */ + private static function parseJsonLdAudio(array $siteinfo, array $jsonld) + { + $jsonldinfo = []; + + $content = JsonLD::fetchElement($jsonld, 'description'); + if (!empty($content)) { + $jsonldinfo['audio_description'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'name'); + if (!empty($content)) { + $jsonldinfo['audio_description'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'contentUrl'); + if (!empty($content)) { + $jsonldinfo['audio_url'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'thumbnailUrl'); + if (!empty($content)) { + $jsonldinfo['audio_img'] = trim($content); + } + + Logger::info('Fetched Audio information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]); + return array_merge($siteinfo, $jsonldinfo); + } + /** * Convert tags from CSV to an array * From da6b54925a94c092283a80d574e2bd152b1d3bed Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 17 Mar 2021 07:36:16 +0000 Subject: [PATCH 07/11] Added medias --- src/Util/ParseUrl.php | 91 +++++++++++++++++++++++++++++++------------ 1 file changed, 66 insertions(+), 25 deletions(-) diff --git a/src/Util/ParseUrl.php b/src/Util/ParseUrl.php index 6b59acd5e..b9e36a865 100644 --- a/src/Util/ParseUrl.php +++ b/src/Util/ParseUrl.php @@ -478,7 +478,7 @@ class ParseUrl $siteinfo['type'] = 'link'; } - if (!empty($siteinfo['image'])) { + if (!empty($siteinfo['image']) && empty($siteinfo['images'])) { $src = self::completeUrl($siteinfo['image'], $url); unset($siteinfo['image']); @@ -561,8 +561,14 @@ class ParseUrl case 'NewsArticle': case 'ScholarlyArticle': case 'ReportageNewsArticle': + case 'SocialMediaPosting': + case 'LiveBlogPosting': + case 'BlogPosting': + case 'DiscussionForumPosting': return self::parseJsonLdArticle($siteinfo, $jsonld); case 'WebPage': + case 'CollectionPage': + case 'ImageGallery': case 'RadioEpisode': case 'Event': return self::parseJsonLdWebPage($siteinfo, $jsonld); @@ -571,31 +577,29 @@ class ParseUrl case 'Organization': case 'NewsMediaOrganization': case 'LocalBusiness': - return self::parseJsonLdWebOrganization($siteinfo, $jsonld); + return self::parseJsonLdWebOrganization($siteinfo, $jsonld); case 'Person': return self::parseJsonLdWebPerson($siteinfo, $jsonld); - case 'Audio': case 'AudioObject': - return self::parseJsonLdAudio($siteinfo, $jsonld); - + case 'Audio': + return self::parseJsonLdMediaObject($siteinfo, $jsonld, 'audio'); case 'VideoObject': + return self::parseJsonLdMediaObject($siteinfo, $jsonld, 'video'); case 'ImageObject': + return self::parseJsonLdMediaObject($siteinfo, $jsonld, 'images'); - case 'WPHeader': // Temp - case 'WPSideBar': // Temp - case 'WPFooter': // Temp + case 'WPHeader': + case 'WPSideBar': + case 'WPFooter': - case 'LiveBlogPosting': - case 'SocialMediaPosting': - case 'BreadcrumbList': - case 'ItemList': case 'LegalService': case 'MusicGroup': + + case 'ItemList': + case 'BreadcrumbList': case 'Blog': - case 'BlogPosting': case 'Dataset': - case 'CollectionPage': - case 'ImageGallery': + case 'Product': // quit silently return $siteinfo; default: @@ -668,6 +672,7 @@ class ParseUrl /** * Improve the siteinfo with information from the provided JSON-LD Article information + * @see https://schema.org/Article * * @param array $siteinfo * @param array $jsonld @@ -727,6 +732,7 @@ class ParseUrl /** * Improve the siteinfo with information from the provided JSON-LD WebPage information + * @see https://schema.org/WebPage * * @param array $siteinfo * @param array $jsonld @@ -765,6 +771,7 @@ class ParseUrl /** * Improve the siteinfo with information from the provided JSON-LD WebSite information + * @see https://schema.org/WebSite * * @param array $siteinfo * @param array $jsonld @@ -802,6 +809,7 @@ class ParseUrl /** * Improve the siteinfo with information from the provided JSON-LD Organization information + * @see https://schema.org/Organization * * @param array $siteinfo * @param array $jsonld @@ -837,6 +845,7 @@ class ParseUrl /** * Improve the siteinfo with information from the provided JSON-LD Person information + * @see https://schema.org/Person * * @param array $siteinfo * @param array $jsonld @@ -871,38 +880,70 @@ class ParseUrl } /** - * Improve the siteinfo with information from the provided JSON-LD Audio information + * Improve the siteinfo with information from the provided JSON-LD MediaObject + * @see https://schema.org/MediaObject * * @param array $siteinfo * @param array $jsonld * @return array siteinfo */ - private static function parseJsonLdAudio(array $siteinfo, array $jsonld) + private static function parseJsonLdMediaObject(array $siteinfo, array $jsonld, string $name) { - $jsonldinfo = []; + $media = []; + + $content = JsonLD::fetchElement($jsonld, 'caption'); + if (!empty($content)) { + $media['caption'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'url'); + if (!empty($content)) { + $media['src'] = trim($content); + } $content = JsonLD::fetchElement($jsonld, 'description'); if (!empty($content)) { - $jsonldinfo['audio_description'] = trim($content); + $media['description'] = trim($content); } $content = JsonLD::fetchElement($jsonld, 'name'); - if (!empty($content)) { - $jsonldinfo['audio_description'] = trim($content); + if (!empty($content) && (($media['description'] ?? '') != trim($content))) { + $media['name'] = trim($content); } $content = JsonLD::fetchElement($jsonld, 'contentUrl'); if (!empty($content)) { - $jsonldinfo['audio_url'] = trim($content); + $media['content'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'embedUrl'); + if (!empty($content)) { + $media['embed'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'height'); + if (!empty($content)) { + $media['height'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'width'); + if (!empty($content)) { + $media['width'] = trim($content); } $content = JsonLD::fetchElement($jsonld, 'thumbnailUrl'); if (!empty($content)) { - $jsonldinfo['audio_img'] = trim($content); + $media['preview'] = trim($content); } - Logger::info('Fetched Audio information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]); - return array_merge($siteinfo, $jsonldinfo); + $content = JsonLD::fetchElement($jsonld, 'image'); + if (!empty($content)) { + $media['image'] = trim($content); + } + + Logger::info('Fetched Media information', ['url' => $siteinfo['url'], 'fetched' => $media]); + $siteinfo[$name][] = $media; + return $siteinfo; } /** From f11712cdc08065b3565cac69524cf075feea63bc Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 17 Mar 2021 17:11:50 +0000 Subject: [PATCH 08/11] Some more added types --- src/Util/ParseUrl.php | 86 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 68 insertions(+), 18 deletions(-) diff --git a/src/Util/ParseUrl.php b/src/Util/ParseUrl.php index b9e36a865..745ab5c74 100644 --- a/src/Util/ParseUrl.php +++ b/src/Util/ParseUrl.php @@ -556,29 +556,70 @@ class ParseUrl return $siteinfo; } + // Silently ignore some types that aren't processed + if (in_array($type, ['SiteNavigationElement', 'JobPosting', 'CreativeWork', + 'WPHeader', 'WPSideBar', 'WPFooter', 'LegalService', + 'ItemList', 'BreadcrumbList', 'Blog', 'Dataset', 'Product'])) { + return $siteinfo; + } + switch ($type) { case 'Article': + case 'AdvertiserContentArticle': case 'NewsArticle': + case 'Report': + case 'SatiricalArticle': case 'ScholarlyArticle': + case 'SocialMediaPosting': + case 'TechArticle': case 'ReportageNewsArticle': case 'SocialMediaPosting': - case 'LiveBlogPosting': case 'BlogPosting': + case 'LiveBlogPosting': case 'DiscussionForumPosting': return self::parseJsonLdArticle($siteinfo, $jsonld); case 'WebPage': + case 'AboutPage': + case 'CheckoutPage': case 'CollectionPage': + case 'ContactPage': + case 'FAQPage': + case 'ItemPage': + case 'MedicalWebPage': + case 'ProfilePage': + case 'QAPage': + case 'RealEstateListing': + case 'SearchResultsPage': + case 'MediaGallery': case 'ImageGallery': + case 'VideoGallery': case 'RadioEpisode': case 'Event': return self::parseJsonLdWebPage($siteinfo, $jsonld); case 'WebSite': return self::parseJsonLdWebSite($siteinfo, $jsonld); case 'Organization': - case 'NewsMediaOrganization': + case 'Airline': + case 'Consortium': + case 'Corporation': + case 'EducationalOrganization': + case 'FundingScheme': + case 'GovernmentOrganization': + case 'LibrarySystem': case 'LocalBusiness': + case 'MedicalOrganization': + case 'NGO': + case 'NewsMediaOrganization': + case 'Project': + case 'SportsOrganization': + case 'WorkersUnion': return self::parseJsonLdWebOrganization($siteinfo, $jsonld); case 'Person': + case 'Patient': + case 'PerformingGroup': + case 'DanceGroup'; + case 'MusicGroup': + case 'TheaterGroup': return self::parseJsonLdWebPerson($siteinfo, $jsonld); case 'AudioObject': case 'Audio': @@ -587,23 +628,8 @@ class ParseUrl return self::parseJsonLdMediaObject($siteinfo, $jsonld, 'video'); case 'ImageObject': return self::parseJsonLdMediaObject($siteinfo, $jsonld, 'images'); - - case 'WPHeader': - case 'WPSideBar': - case 'WPFooter': - - case 'LegalService': - case 'MusicGroup': - - case 'ItemList': - case 'BreadcrumbList': - case 'Blog': - case 'Dataset': - case 'Product': - // quit silently - return $siteinfo; default: - Logger::info('Unsupported or unknown type', ['type' => $type, 'url' => $siteinfo['url']]); + Logger::info('Unknown type', ['type' => $type, 'url' => $siteinfo['url']]); return $siteinfo; } } @@ -641,6 +667,10 @@ class ParseUrl if (!empty($content) && is_string($content)) { $jsonldinfo['publisher_name'] = trim($content); } + $content = JsonLD::fetchElement($brand, 'url', '@type', 'brand'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['publisher_url'] = trim($content); + } } } elseif (!empty($jsonld['publisher']) && is_string($jsonld['publisher'])) { $jsonldinfo['publisher_name'] = trim($jsonld['publisher']); @@ -829,6 +859,11 @@ class ParseUrl $jsonldinfo['publisher_description'] = trim($content); } + $content = JsonLD::fetchElement($jsonld, 'sameAs'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['publisher_url'] = trim($content); + } + $content = JsonLD::fetchElement($jsonld, 'url'); if (!empty($content)) { $jsonldinfo['publisher_url'] = trim($content); @@ -839,6 +874,16 @@ class ParseUrl $jsonldinfo['publisher_img'] = trim($content); } + $content = JsonLD::fetchElement($jsonld, 'brand', 'name', '@type', 'Organization'); + if (!empty($content)) { + $jsonldinfo['publisher_name'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'brand', 'url', '@type', 'Organization'); + if (!empty($content)) { + $jsonldinfo['publisher_url'] = trim($content); + } + Logger::info('Fetched Organization information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]); return array_merge($siteinfo, $jsonldinfo); } @@ -865,6 +910,11 @@ class ParseUrl $jsonldinfo['author_description'] = trim($content); } + $content = JsonLD::fetchElement($jsonld, 'sameAs'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['author_url'] = trim($content); + } + $content = JsonLD::fetchElement($jsonld, 'url'); if (!empty($content)) { $jsonldinfo['author_url'] = trim($content); From 558189e9d12cf88461506f971be1729c68cdb3d5 Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 17 Mar 2021 22:29:12 +0000 Subject: [PATCH 09/11] Reorganized functions --- src/Util/ParseUrl.php | 979 +++++++++++++++++++++--------------------- 1 file changed, 490 insertions(+), 489 deletions(-) diff --git a/src/Util/ParseUrl.php b/src/Util/ParseUrl.php index 745ab5c74..389c94a72 100644 --- a/src/Util/ParseUrl.php +++ b/src/Util/ParseUrl.php @@ -507,495 +507,6 @@ class ParseUrl return $siteinfo; } - /** - * Parse the Json-Ld parts - * - * @param array $siteinfo - * @param array $jsonld - * @return array siteinfo - */ - private static function parseParts(array $siteinfo, array $jsonld) - { - if (!empty($jsonld['@graph']) && is_array($jsonld['@graph'])) { - foreach ($jsonld['@graph'] as $part) { - $siteinfo = self::parseJsonLd($siteinfo, $part); - } - } elseif (!empty($jsonld['@type'])) { - $siteinfo = self::parseJsonLd($siteinfo, $jsonld); - } elseif (!empty($jsonld)) { - $keys = array_keys($jsonld); - $numeric_keys = true; - foreach ($keys as $key) { - if (!is_int($key)) { - $numeric_keys = false; - } - } - if ($numeric_keys) { - foreach ($jsonld as $part) { - $siteinfo = self::parseParts($siteinfo, $part); - } - } - } - - return $siteinfo; - } - - /** - * Improve the siteinfo with information from the provided JSON-LD information - * @see https://jsonld.com/ - * - * @param array $siteinfo - * @param array $jsonld - * @return array siteinfo - */ - private static function parseJsonLd(array $siteinfo, array $jsonld) - { - $type = JsonLD::fetchElement($jsonld, '@type'); - if (empty($type)) { - Logger::info('Empty type', ['url' => $siteinfo['url']]); - return $siteinfo; - } - - // Silently ignore some types that aren't processed - if (in_array($type, ['SiteNavigationElement', 'JobPosting', 'CreativeWork', - 'WPHeader', 'WPSideBar', 'WPFooter', 'LegalService', - 'ItemList', 'BreadcrumbList', 'Blog', 'Dataset', 'Product'])) { - return $siteinfo; - } - - switch ($type) { - case 'Article': - case 'AdvertiserContentArticle': - case 'NewsArticle': - case 'Report': - case 'SatiricalArticle': - case 'ScholarlyArticle': - case 'SocialMediaPosting': - case 'TechArticle': - case 'ReportageNewsArticle': - case 'SocialMediaPosting': - case 'BlogPosting': - case 'LiveBlogPosting': - case 'DiscussionForumPosting': - return self::parseJsonLdArticle($siteinfo, $jsonld); - case 'WebPage': - case 'AboutPage': - case 'CheckoutPage': - case 'CollectionPage': - case 'ContactPage': - case 'FAQPage': - case 'ItemPage': - case 'MedicalWebPage': - case 'ProfilePage': - case 'QAPage': - case 'RealEstateListing': - case 'SearchResultsPage': - case 'MediaGallery': - case 'ImageGallery': - case 'VideoGallery': - case 'RadioEpisode': - case 'Event': - return self::parseJsonLdWebPage($siteinfo, $jsonld); - case 'WebSite': - return self::parseJsonLdWebSite($siteinfo, $jsonld); - case 'Organization': - case 'Airline': - case 'Consortium': - case 'Corporation': - case 'EducationalOrganization': - case 'FundingScheme': - case 'GovernmentOrganization': - case 'LibrarySystem': - case 'LocalBusiness': - case 'MedicalOrganization': - case 'NGO': - case 'NewsMediaOrganization': - case 'Project': - case 'SportsOrganization': - case 'WorkersUnion': - return self::parseJsonLdWebOrganization($siteinfo, $jsonld); - case 'Person': - case 'Patient': - case 'PerformingGroup': - case 'DanceGroup'; - case 'MusicGroup': - case 'TheaterGroup': - return self::parseJsonLdWebPerson($siteinfo, $jsonld); - case 'AudioObject': - case 'Audio': - return self::parseJsonLdMediaObject($siteinfo, $jsonld, 'audio'); - case 'VideoObject': - return self::parseJsonLdMediaObject($siteinfo, $jsonld, 'video'); - case 'ImageObject': - return self::parseJsonLdMediaObject($siteinfo, $jsonld, 'images'); - default: - Logger::info('Unknown type', ['type' => $type, 'url' => $siteinfo['url']]); - return $siteinfo; - } - } - - /** - * Improve the siteinfo with information from the provided JSON-LD information concerning authors and publishers - * - * @param array $siteinfo - * @param array $jsonld - * @return array siteinfo - */ - private static function parseJsonLdAuthor(array $siteinfo, array $jsonld) - { - $jsonldinfo = []; - - if (!empty($jsonld['publisher']) && is_array($jsonld['publisher'])) { - $content = JsonLD::fetchElement($jsonld, 'publisher', 'name'); - if (!empty($content) && is_string($content)) { - $jsonldinfo['publisher_name'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'publisher', 'sameAs'); - if (!empty($content) && is_string($content)) { - $jsonldinfo['publisher_url'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'publisher', 'url'); - if (!empty($content) && is_string($content)) { - $jsonldinfo['publisher_url'] = trim($content); - } - - $brand = JsonLD::fetchElement($jsonld, 'publisher', 'brand', '@type', 'Organization'); - if (!empty($brand) && is_array($brand)) { - $content = JsonLD::fetchElement($brand, 'name', '@type', 'brand'); - if (!empty($content) && is_string($content)) { - $jsonldinfo['publisher_name'] = trim($content); - } - $content = JsonLD::fetchElement($brand, 'url', '@type', 'brand'); - if (!empty($content) && is_string($content)) { - $jsonldinfo['publisher_url'] = trim($content); - } - } - } elseif (!empty($jsonld['publisher']) && is_string($jsonld['publisher'])) { - $jsonldinfo['publisher_name'] = trim($jsonld['publisher']); - } - - if (!empty($jsonld['author']) && is_array($jsonld['author'])) { - $content = JsonLD::fetchElement($jsonld, 'author', 'name'); - if (!empty($content) && is_string($content)) { - $jsonldinfo['author_name'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'author', 'sameAs'); - if (!empty($content) && is_string($content)) { - $jsonldinfo['author_url'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'author', 'url'); - if (!empty($content) && is_string($content)) { - $jsonldinfo['author_url'] = trim($content); - } - } elseif (!empty($jsonld['author']) && is_string($jsonld['author'])) { - $jsonldinfo['author_name'] = trim($jsonld['author']); - } - - Logger::info('Fetched author information', ['fetched' => $jsonldinfo]); - - return array_merge($siteinfo, $jsonldinfo); - } - - /** - * Improve the siteinfo with information from the provided JSON-LD Article information - * @see https://schema.org/Article - * - * @param array $siteinfo - * @param array $jsonld - * @return array siteinfo - */ - private static function parseJsonLdArticle(array $siteinfo, array $jsonld) - { - $jsonldinfo = []; - - $content = JsonLD::fetchElement($jsonld, 'headline'); - if (!empty($content) && is_string($content)) { - $jsonldinfo['title'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'alternativeHeadline'); - if (!empty($content) && is_string($content) && (($jsonldinfo['title'] ?? '') != trim($content))) { - $jsonldinfo['alternative_title'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'description'); - if (!empty($content) && is_string($content)) { - $jsonldinfo['text'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'thumbnailUrl'); - if (!empty($content)) { - $jsonldinfo['image'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'image', 'url', '@type', 'ImageObject'); - if (!empty($content) && is_string($content)) { - $jsonldinfo['image'] = trim($content); - } - - if (!empty($jsonld['keywords']) && !is_array($jsonld['keywords'])) { - $content = JsonLD::fetchElement($jsonld, 'keywords'); - if (!empty($content)) { - $siteinfo['keywords'] = []; - $keywords = explode(',', $content); - foreach ($keywords as $keyword) { - $siteinfo['keywords'][] = trim($keyword); - } - } - } else { - $content = JsonLD::fetchElementArray($jsonld, 'keywords'); - if (!empty($content) && is_array($content)) { - $jsonldinfo['keywords'] = $content; - } - } - - $jsonldinfo = self::parseJsonLdAuthor($jsonldinfo, $jsonld); - - Logger::info('Fetched article information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]); - - return array_merge($siteinfo, $jsonldinfo); - } - - /** - * Improve the siteinfo with information from the provided JSON-LD WebPage information - * @see https://schema.org/WebPage - * - * @param array $siteinfo - * @param array $jsonld - * @return array siteinfo - */ - private static function parseJsonLdWebPage(array $siteinfo, array $jsonld) - { - $jsonldinfo = []; - - $content = JsonLD::fetchElement($jsonld, 'name'); - if (!empty($content)) { - $jsonldinfo['title'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'description'); - if (!empty($content)) { - $jsonldinfo['text'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'image'); - if (!empty($content)) { - $jsonldinfo['image'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'thumbnailUrl'); - if (!empty($content)) { - $jsonldinfo['image'] = trim($content); - } - - $jsonldinfo = self::parseJsonLdAuthor($jsonldinfo, $jsonld); - - Logger::info('Fetched webpage information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]); - - return array_merge($siteinfo, $jsonldinfo); - } - - /** - * Improve the siteinfo with information from the provided JSON-LD WebSite information - * @see https://schema.org/WebSite - * - * @param array $siteinfo - * @param array $jsonld - * @return array siteinfo - */ - private static function parseJsonLdWebSite(array $siteinfo, array $jsonld) - { - $jsonldinfo = []; - - $content = JsonLD::fetchElement($jsonld, 'name'); - if (!empty($content)) { - $jsonldinfo['publisher_name'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'description'); - if (!empty($content)) { - $jsonldinfo['publisher_description'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'url'); - if (!empty($content)) { - $jsonldinfo['publisher_url'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'thumbnailUrl'); - if (!empty($content)) { - $jsonldinfo['image'] = trim($content); - } - - $jsonldinfo = self::parseJsonLdAuthor($jsonldinfo, $jsonld); - - Logger::info('Fetched WebSite information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]); - return array_merge($siteinfo, $jsonldinfo); - } - - /** - * Improve the siteinfo with information from the provided JSON-LD Organization information - * @see https://schema.org/Organization - * - * @param array $siteinfo - * @param array $jsonld - * @return array siteinfo - */ - private static function parseJsonLdWebOrganization(array $siteinfo, array $jsonld) - { - $jsonldinfo = []; - - $content = JsonLD::fetchElement($jsonld, 'name'); - if (!empty($content)) { - $jsonldinfo['publisher_name'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'description'); - if (!empty($content)) { - $jsonldinfo['publisher_description'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'sameAs'); - if (!empty($content) && is_string($content)) { - $jsonldinfo['publisher_url'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'url'); - if (!empty($content)) { - $jsonldinfo['publisher_url'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'logo', 'url', '@type', 'ImageObject'); - if (!empty($content)) { - $jsonldinfo['publisher_img'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'brand', 'name', '@type', 'Organization'); - if (!empty($content)) { - $jsonldinfo['publisher_name'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'brand', 'url', '@type', 'Organization'); - if (!empty($content)) { - $jsonldinfo['publisher_url'] = trim($content); - } - - Logger::info('Fetched Organization information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]); - return array_merge($siteinfo, $jsonldinfo); - } - - /** - * Improve the siteinfo with information from the provided JSON-LD Person information - * @see https://schema.org/Person - * - * @param array $siteinfo - * @param array $jsonld - * @return array siteinfo - */ - private static function parseJsonLdWebPerson(array $siteinfo, array $jsonld) - { - $jsonldinfo = []; - - $content = JsonLD::fetchElement($jsonld, 'name'); - if (!empty($content)) { - $jsonldinfo['author_name'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'description'); - if (!empty($content)) { - $jsonldinfo['author_description'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'sameAs'); - if (!empty($content) && is_string($content)) { - $jsonldinfo['author_url'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'url'); - if (!empty($content)) { - $jsonldinfo['author_url'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'image', 'url', '@type', 'ImageObject'); - if (!empty($content)) { - $jsonldinfo['author_img'] = trim($content); - } - - Logger::info('Fetched Person information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]); - return array_merge($siteinfo, $jsonldinfo); - } - - /** - * Improve the siteinfo with information from the provided JSON-LD MediaObject - * @see https://schema.org/MediaObject - * - * @param array $siteinfo - * @param array $jsonld - * @return array siteinfo - */ - private static function parseJsonLdMediaObject(array $siteinfo, array $jsonld, string $name) - { - $media = []; - - $content = JsonLD::fetchElement($jsonld, 'caption'); - if (!empty($content)) { - $media['caption'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'url'); - if (!empty($content)) { - $media['src'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'description'); - if (!empty($content)) { - $media['description'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'name'); - if (!empty($content) && (($media['description'] ?? '') != trim($content))) { - $media['name'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'contentUrl'); - if (!empty($content)) { - $media['content'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'embedUrl'); - if (!empty($content)) { - $media['embed'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'height'); - if (!empty($content)) { - $media['height'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'width'); - if (!empty($content)) { - $media['width'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'thumbnailUrl'); - if (!empty($content)) { - $media['preview'] = trim($content); - } - - $content = JsonLD::fetchElement($jsonld, 'image'); - if (!empty($content)) { - $media['image'] = trim($content); - } - - Logger::info('Fetched Media information', ['url' => $siteinfo['url'], 'fetched' => $media]); - $siteinfo[$name][] = $media; - return $siteinfo; - } - /** * Convert tags from CSV to an array * @@ -1077,4 +588,494 @@ class ParseUrl return($complete); } + + /** + * Parse the Json-Ld parts of a web page + * + * @param array $siteinfo + * @param array $jsonld + * @return array siteinfo + */ + private static function parseParts(array $siteinfo, array $jsonld) + { + if (!empty($jsonld['@graph']) && is_array($jsonld['@graph'])) { + foreach ($jsonld['@graph'] as $part) { + $siteinfo = self::parseJsonLd($siteinfo, $part); + } + } elseif (!empty($jsonld['@type'])) { + $siteinfo = self::parseJsonLd($siteinfo, $jsonld); + } elseif (!empty($jsonld)) { + $keys = array_keys($jsonld); + $numeric_keys = true; + foreach ($keys as $key) { + if (!is_int($key)) { + $numeric_keys = false; + } + } + if ($numeric_keys) { + foreach ($jsonld as $part) { + $siteinfo = self::parseParts($siteinfo, $part); + } + } + } + + return $siteinfo; + } + + /** + * Improve the siteinfo with information from the provided JSON-LD information + * @see https://jsonld.com/ + * @see https://schema.org/ + * + * @param array $siteinfo + * @param array $jsonld + * @return array siteinfo + */ + private static function parseJsonLd(array $siteinfo, array $jsonld) + { + $type = JsonLD::fetchElement($jsonld, '@type'); + if (empty($type)) { + Logger::info('Empty type', ['url' => $siteinfo['url']]); + return $siteinfo; + } + + // Silently ignore some types that aren't processed + if (in_array($type, ['SiteNavigationElement', 'JobPosting', 'CreativeWork', 'MusicAlbum', + 'WPHeader', 'WPSideBar', 'WPFooter', 'LegalService', + 'ItemList', 'BreadcrumbList', 'Blog', 'Dataset', 'Product'])) { + return $siteinfo; + } + + switch ($type) { + case 'Article': + case 'AdvertiserContentArticle': + case 'NewsArticle': + case 'Report': + case 'SatiricalArticle': + case 'ScholarlyArticle': + case 'SocialMediaPosting': + case 'TechArticle': + case 'ReportageNewsArticle': + case 'SocialMediaPosting': + case 'BlogPosting': + case 'LiveBlogPosting': + case 'DiscussionForumPosting': + return self::parseJsonLdArticle($siteinfo, $jsonld); + case 'WebPage': + case 'AboutPage': + case 'CheckoutPage': + case 'CollectionPage': + case 'ContactPage': + case 'FAQPage': + case 'ItemPage': + case 'MedicalWebPage': + case 'ProfilePage': + case 'QAPage': + case 'RealEstateListing': + case 'SearchResultsPage': + case 'MediaGallery': + case 'ImageGallery': + case 'VideoGallery': + case 'RadioEpisode': + case 'Event': + return self::parseJsonLdWebPage($siteinfo, $jsonld); + case 'WebSite': + return self::parseJsonLdWebSite($siteinfo, $jsonld); + case 'Organization': + case 'Airline': + case 'Consortium': + case 'Corporation': + case 'EducationalOrganization': + case 'FundingScheme': + case 'GovernmentOrganization': + case 'LibrarySystem': + case 'LocalBusiness': + case 'MedicalOrganization': + case 'NGO': + case 'NewsMediaOrganization': + case 'Project': + case 'SportsOrganization': + case 'WorkersUnion': + return self::parseJsonLdWebOrganization($siteinfo, $jsonld); + case 'Person': + case 'Patient': + case 'PerformingGroup': + case 'DanceGroup'; + case 'MusicGroup': + case 'TheaterGroup': + return self::parseJsonLdWebPerson($siteinfo, $jsonld); + case 'AudioObject': + case 'Audio': + return self::parseJsonLdMediaObject($siteinfo, $jsonld, 'audio'); + case 'VideoObject': + return self::parseJsonLdMediaObject($siteinfo, $jsonld, 'video'); + case 'ImageObject': + return self::parseJsonLdMediaObject($siteinfo, $jsonld, 'images'); + default: + Logger::info('Unknown type', ['type' => $type, 'url' => $siteinfo['url']]); + return $siteinfo; + } + } + + /** + * Fetch author and publisher data + * + * @param array $siteinfo + * @param array $jsonld + * @return array siteinfo + */ + private static function parseJsonLdAuthor(array $siteinfo, array $jsonld) + { + $jsonldinfo = []; + + if (!empty($jsonld['publisher']) && is_array($jsonld['publisher'])) { + $content = JsonLD::fetchElement($jsonld, 'publisher', 'name'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['publisher_name'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'publisher', 'sameAs'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['publisher_url'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'publisher', 'url'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['publisher_url'] = trim($content); + } + + $brand = JsonLD::fetchElement($jsonld, 'publisher', 'brand', '@type', 'Organization'); + if (!empty($brand) && is_array($brand)) { + $content = JsonLD::fetchElement($brand, 'name', '@type', 'brand'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['publisher_name'] = trim($content); + } + $content = JsonLD::fetchElement($brand, 'url', '@type', 'brand'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['publisher_url'] = trim($content); + } + } + } elseif (!empty($jsonld['publisher']) && is_string($jsonld['publisher'])) { + $jsonldinfo['publisher_name'] = trim($jsonld['publisher']); + } + + if (!empty($jsonld['author']) && is_array($jsonld['author'])) { + $content = JsonLD::fetchElement($jsonld, 'author', 'name'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['author_name'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'author', 'sameAs'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['author_url'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'author', 'url'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['author_url'] = trim($content); + } + } elseif (!empty($jsonld['author']) && is_string($jsonld['author'])) { + $jsonldinfo['author_name'] = trim($jsonld['author']); + } + + Logger::info('Fetched Author information', ['fetched' => $jsonldinfo]); + + return array_merge($siteinfo, $jsonldinfo); + } + + /** + * Fetch data from the provided JSON-LD Article type + * @see https://schema.org/Article + * + * @param array $siteinfo + * @param array $jsonld + * @return array siteinfo + */ + private static function parseJsonLdArticle(array $siteinfo, array $jsonld) + { + $jsonldinfo = []; + + $content = JsonLD::fetchElement($jsonld, 'headline'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['title'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'alternativeHeadline'); + if (!empty($content) && is_string($content) && (($jsonldinfo['title'] ?? '') != trim($content))) { + $jsonldinfo['alternative_title'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'description'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['text'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'thumbnailUrl'); + if (!empty($content)) { + $jsonldinfo['image'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'image', 'url', '@type', 'ImageObject'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['image'] = trim($content); + } + + if (!empty($jsonld['keywords']) && !is_array($jsonld['keywords'])) { + $content = JsonLD::fetchElement($jsonld, 'keywords'); + if (!empty($content)) { + $siteinfo['keywords'] = []; + $keywords = explode(',', $content); + foreach ($keywords as $keyword) { + $siteinfo['keywords'][] = trim($keyword); + } + } + } else { + $content = JsonLD::fetchElementArray($jsonld, 'keywords'); + if (!empty($content) && is_array($content)) { + $jsonldinfo['keywords'] = $content; + } + } + + $jsonldinfo = self::parseJsonLdAuthor($jsonldinfo, $jsonld); + + Logger::info('Fetched article information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]); + + return array_merge($siteinfo, $jsonldinfo); + } + + /** + * Fetch data from the provided JSON-LD WebPage type + * @see https://schema.org/WebPage + * + * @param array $siteinfo + * @param array $jsonld + * @return array siteinfo + */ + private static function parseJsonLdWebPage(array $siteinfo, array $jsonld) + { + $jsonldinfo = []; + + $content = JsonLD::fetchElement($jsonld, 'name'); + if (!empty($content)) { + $jsonldinfo['title'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'description'); + if (!empty($content)) { + $jsonldinfo['text'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'image'); + if (!empty($content)) { + $jsonldinfo['image'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'thumbnailUrl'); + if (!empty($content)) { + $jsonldinfo['image'] = trim($content); + } + + $jsonldinfo = self::parseJsonLdAuthor($jsonldinfo, $jsonld); + + Logger::info('Fetched WebPage information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]); + + return array_merge($siteinfo, $jsonldinfo); + } + + /** + * Fetch data from the provided JSON-LD WebSite type + * @see https://schema.org/WebSite + * + * @param array $siteinfo + * @param array $jsonld + * @return array siteinfo + */ + private static function parseJsonLdWebSite(array $siteinfo, array $jsonld) + { + $jsonldinfo = []; + + $content = JsonLD::fetchElement($jsonld, 'name'); + if (!empty($content)) { + $jsonldinfo['publisher_name'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'description'); + if (!empty($content)) { + $jsonldinfo['publisher_description'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'url'); + if (!empty($content)) { + $jsonldinfo['publisher_url'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'thumbnailUrl'); + if (!empty($content)) { + $jsonldinfo['image'] = trim($content); + } + + $jsonldinfo = self::parseJsonLdAuthor($jsonldinfo, $jsonld); + + Logger::info('Fetched WebSite information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]); + return array_merge($siteinfo, $jsonldinfo); + } + + /** + * Fetch data from the provided JSON-LD Organization type + * @see https://schema.org/Organization + * + * @param array $siteinfo + * @param array $jsonld + * @return array siteinfo + */ + private static function parseJsonLdWebOrganization(array $siteinfo, array $jsonld) + { + $jsonldinfo = []; + + $content = JsonLD::fetchElement($jsonld, 'name'); + if (!empty($content)) { + $jsonldinfo['publisher_name'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'description'); + if (!empty($content)) { + $jsonldinfo['publisher_description'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'sameAs'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['publisher_url'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'url'); + if (!empty($content)) { + $jsonldinfo['publisher_url'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'logo', 'url', '@type', 'ImageObject'); + if (!empty($content)) { + $jsonldinfo['publisher_img'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'brand', 'name', '@type', 'Organization'); + if (!empty($content)) { + $jsonldinfo['publisher_name'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'brand', 'url', '@type', 'Organization'); + if (!empty($content)) { + $jsonldinfo['publisher_url'] = trim($content); + } + + Logger::info('Fetched Organization information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]); + return array_merge($siteinfo, $jsonldinfo); + } + + /** + * Fetch data from the provided JSON-LD Person type + * @see https://schema.org/Person + * + * @param array $siteinfo + * @param array $jsonld + * @return array siteinfo + */ + private static function parseJsonLdWebPerson(array $siteinfo, array $jsonld) + { + $jsonldinfo = []; + + $content = JsonLD::fetchElement($jsonld, 'name'); + if (!empty($content)) { + $jsonldinfo['author_name'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'description'); + if (!empty($content)) { + $jsonldinfo['author_description'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'sameAs'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['author_url'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'url'); + if (!empty($content)) { + $jsonldinfo['author_url'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'image', 'url', '@type', 'ImageObject'); + if (!empty($content)) { + $jsonldinfo['author_img'] = trim($content); + } + + Logger::info('Fetched Person information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]); + return array_merge($siteinfo, $jsonldinfo); + } + + /** + * Fetch data from the provided JSON-LD MediaObject type + * @see https://schema.org/MediaObject + * + * @param array $siteinfo + * @param array $jsonld + * @return array siteinfo + */ + private static function parseJsonLdMediaObject(array $siteinfo, array $jsonld, string $name) + { + $media = []; + + $content = JsonLD::fetchElement($jsonld, 'caption'); + if (!empty($content)) { + $media['caption'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'url'); + if (!empty($content)) { + $media['src'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'description'); + if (!empty($content)) { + $media['description'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'name'); + if (!empty($content) && (($media['description'] ?? '') != trim($content))) { + $media['name'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'contentUrl'); + if (!empty($content)) { + $media['content'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'embedUrl'); + if (!empty($content)) { + $media['embed'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'height'); + if (!empty($content)) { + $media['height'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'width'); + if (!empty($content)) { + $media['width'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'thumbnailUrl'); + if (!empty($content)) { + $media['preview'] = trim($content); + } + + $content = JsonLD::fetchElement($jsonld, 'image'); + if (!empty($content)) { + $media['image'] = trim($content); + } + + Logger::info('Fetched Media information', ['url' => $siteinfo['url'], 'fetched' => $media]); + $siteinfo[$name][] = $media; + return $siteinfo; + } } From e3409c72e24889bdaf79fdd9dca4621c37e91bdd Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 17 Mar 2021 23:31:16 +0000 Subject: [PATCH 10/11] Add logos for person and organisation --- src/Util/ParseUrl.php | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/src/Util/ParseUrl.php b/src/Util/ParseUrl.php index 389c94a72..6dc4c2bea 100644 --- a/src/Util/ParseUrl.php +++ b/src/Util/ParseUrl.php @@ -746,14 +746,33 @@ class ParseUrl $brand = JsonLD::fetchElement($jsonld, 'publisher', 'brand', '@type', 'Organization'); if (!empty($brand) && is_array($brand)) { - $content = JsonLD::fetchElement($brand, 'name', '@type', 'brand'); + $content = JsonLD::fetchElement($brand, 'name'); if (!empty($content) && is_string($content)) { $jsonldinfo['publisher_name'] = trim($content); } - $content = JsonLD::fetchElement($brand, 'url', '@type', 'brand'); + + $content = JsonLD::fetchElement($brand, 'sameAs'); if (!empty($content) && is_string($content)) { $jsonldinfo['publisher_url'] = trim($content); } + + $content = JsonLD::fetchElement($brand, 'url'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['publisher_url'] = trim($content); + } + + $content = JsonLD::fetchElement($brand, 'logo', 'url'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['publisher_img'] = trim($content); + } + } + + $logo = JsonLD::fetchElement($jsonld, 'publisher', 'logo'); + if (!empty($logo) && is_array($logo)) { + $content = JsonLD::fetchElement($logo, 'url'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['publisher_img'] = trim($content); + } } } elseif (!empty($jsonld['publisher']) && is_string($jsonld['publisher'])) { $jsonldinfo['publisher_name'] = trim($jsonld['publisher']); @@ -774,6 +793,14 @@ class ParseUrl if (!empty($content) && is_string($content)) { $jsonldinfo['author_url'] = trim($content); } + + $logo = JsonLD::fetchElement($jsonld, 'author', 'logo'); + if (!empty($logo) && is_array($logo)) { + $content = JsonLD::fetchElement($logo, 'url'); + if (!empty($content) && is_string($content)) { + $jsonldinfo['author_img'] = trim($content); + } + } } elseif (!empty($jsonld['author']) && is_string($jsonld['author'])) { $jsonldinfo['author_name'] = trim($jsonld['author']); } From f772e11cfa0d6490ab25d8477b117beb5b82c72d Mon Sep 17 00:00:00 2001 From: Michael Date: Thu, 18 Mar 2021 08:04:52 +0000 Subject: [PATCH 11/11] Avoid empty type --- src/Util/ParseUrl.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Util/ParseUrl.php b/src/Util/ParseUrl.php index 6dc4c2bea..3ed48a077 100644 --- a/src/Util/ParseUrl.php +++ b/src/Util/ParseUrl.php @@ -600,7 +600,7 @@ class ParseUrl { if (!empty($jsonld['@graph']) && is_array($jsonld['@graph'])) { foreach ($jsonld['@graph'] as $part) { - $siteinfo = self::parseJsonLd($siteinfo, $part); + $siteinfo = self::parseParts($siteinfo, $part); } } elseif (!empty($jsonld['@type'])) { $siteinfo = self::parseJsonLd($siteinfo, $jsonld); @@ -641,7 +641,7 @@ class ParseUrl // Silently ignore some types that aren't processed if (in_array($type, ['SiteNavigationElement', 'JobPosting', 'CreativeWork', 'MusicAlbum', - 'WPHeader', 'WPSideBar', 'WPFooter', 'LegalService', + 'WPHeader', 'WPSideBar', 'WPFooter', 'LegalService', 'MusicRecording', 'ItemList', 'BreadcrumbList', 'Blog', 'Dataset', 'Product'])) { return $siteinfo; }