ParseUrl: some docu work

This commit is contained in:
rabuzarus 2016-11-27 23:41:55 +01:00
parent 85b51ee41c
commit f229d65f85
2 changed files with 80 additions and 14 deletions

View file

@ -19,6 +19,28 @@ require_once("include/xml.php");
*/ */
class ParseUrl { class ParseUrl {
/**
* @brief Search for cached embeddable data of a url, otherwise fetch it
*
* @param string $url The url of the page which should be scraped
* @param bool $no_guessing If true the parser doesn't search for
* preview pictures
* @param bool $do_oembed The false option is used by the function fetch_oembed()
* to avoid endless loops
*
* @return array which contains needed data for embedding
* string 'url' => The url of the parsed page
* string 'type' => Content type
* string 'title' => The title of the content
* string 'text' => The description for the content
* string 'image' => A preview image of the content (only available
* if $no_guessing = false)
* array 'images' => Array of preview pictures
* string 'keywords' => The tags which belong to the content
*
* @see ParseUrl::getSiteinfo() for more information about scraping
* embeddable content
*/
public static function getSiteinfoCached($url, $no_guessing = false, $do_oembed = true) { public static function getSiteinfoCached($url, $no_guessing = false, $do_oembed = true) {
if ($url == "") { if ($url == "") {
@ -47,7 +69,46 @@ class ParseUrl {
return $data; return $data;
} }
/**
* @brief Parse a page for embeddable content information
*
* This method parses the url for meta data which can be used to embed
* the content. If available it prioritizes Open Graph meta tags.
* If this is not available it uses the twitter cards meta tags.
* As a fallback it uses standard html elements with meta information
* like \<title\>Awesome Title\</title\> or
* \<meta name="description" content="An awesome description"\>
*
* @param string $url The url of the page which should be scraped
* @param bool $no_guessing If true the parser doesn't search for
* preview pictures
* @param bool $do_oembed The false option is used by the function fetch_oembed()
* to avoid endless loops
* @param int $count Internal counter to avoid endless loops
*
* @return array which contains needed data for embedding
* string 'url' => The url of the parsed page
* string 'type' => Content type
* string 'title' => The title of the content
* string 'text' => The description for the content
* string 'image' => A preview image of the content (only available
* if $no_guessing = false)
* array 'images' => Array of preview pictures
* string 'keywords' => The tags which belong to the content
*
* @todo https://developers.google.com/+/plugins/snippet/
* @verbatim
* <meta itemprop="name" content="Awesome title">
* <meta itemprop="description" content="An awesome description">
* <meta itemprop="image" content="http://maple.libertreeproject.org/images/tree-icon.png">
*
* <body itemscope itemtype="http://schema.org/Product">
* <h1 itemprop="name">Shiny Trinket</h1>
* <img itemprop="image" src="{image-url}" />
* <p itemprop="description">Shiny trinkets are shiny.</p>
* </body>
* @endverbatim
*/
public static function getSiteinfo($url, $no_guessing = false, $do_oembed = true, $count = 1) { public static function getSiteinfo($url, $no_guessing = false, $do_oembed = true, $count = 1) {
$a = get_app(); $a = get_app();
@ -441,9 +502,25 @@ class ParseUrl {
$tag = "#" . $tag; $tag = "#" . $tag;
} }
/**
* @brief Add a scheme to an url
*
* The src attribute of some html elements (e.g. images)
* can miss the scheme so we need to add the correct
* scheme
*
* @param string $url The url which is possibly missing
* a scheme (a link to an image)
* @param string $scheme The url with a correct scheme
* (e.g. the url from the webpage which does contain the image)
*
* @return string The url with a scheme
*/
private static function completeUrl($url, $scheme) { private static function completeUrl($url, $scheme) {
$urlarr = parse_url($url); $urlarr = parse_url($url);
// If the url already has a scheme
// we can stop the process here
if (isset($urlarr["scheme"])) { if (isset($urlarr["scheme"])) {
return($url); return($url);
} }

View file

@ -1,4 +1,5 @@
<?php <?php
/** /**
* @file mod/parse_url.php * @file mod/parse_url.php
* @brief The parse_url module * @brief The parse_url module
@ -11,19 +12,7 @@
* the richtext editor doesn't support all kind of html). * the richtext editor doesn't support all kind of html).
* Otherwise the output will be constructed BBCode. * Otherwise the output will be constructed BBCode.
* *
* @todo https://developers.google.com/+/plugins/snippet/ * @see ParseUrl::getSiteinfo() for more information about scraping embeddable content
*
* @verbatim
* <meta itemprop="name" content="Toller Titel">
* <meta itemprop="description" content="Eine tolle Beschreibung">
* <meta itemprop="image" content="http://maple.libertreeproject.org/images/tree-icon.png">
*
* <body itemscope itemtype="http://schema.org/Product">
* <h1 itemprop="name">Shiny Trinket</h1>
* <img itemprop="image" src="{image-url}" />
* <p itemprop="description">Shiny trinkets are shiny.</p>
* </body>
* @endverbatim
*/ */
use \Friendica\ParseUrl; use \Friendica\ParseUrl;