From 99cfae63d7303365d0c4b2256c7194edb590fb7f Mon Sep 17 00:00:00 2001 From: Hypolite Petovan Date: Fri, 17 Feb 2017 22:32:33 -0500 Subject: [PATCH 1/4] Clean trailing whitespaces --- include/ParseUrl.php | 30 +++++++++++++++--------------- include/network.php | 26 +++++++++++++------------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/include/ParseUrl.php b/include/ParseUrl.php index 549d705da..b85175a25 100644 --- a/include/ParseUrl.php +++ b/include/ParseUrl.php @@ -21,13 +21,13 @@ class ParseUrl { /** * @brief Search for chached embeddable data of an url otherwise fetch it - * + * * @param type $url The url of the page which should be scraped * @param type $no_guessing If true the parse doens't search for * preview pictures * @param type $do_oembed The false option is used by the function fetch_oembed() * to avoid endless loops - * + * * @return array which contains needed data for embedding * string 'url' => The url of the parsed page * string 'type' => Content type @@ -37,9 +37,9 @@ class ParseUrl { * if $no_geuessing = false * array'images' = Array of preview pictures * string 'keywords' => The tags which belong to the content - * + * * @see ParseUrl::getSiteinfo() for more information about scraping - * embeddable content + * embeddable content */ public static function getSiteinfoCached($url, $no_guessing = false, $do_oembed = true) { @@ -71,21 +71,21 @@ class ParseUrl { } /** * @brief Parse a page for embeddable content information - * + * * This method parses to url for meta data which can be used to embed * the content. If available it prioritizes Open Graph meta tags. * If this is not available it uses the twitter cards meta tags. 
* As fallback it uses standard html elements with meta informations * like \Awesome Title\ or * \ - * + * * @param type $url The url of the page which should be scraped * @param type $no_guessing If true the parse doens't search for * preview pictures * @param type $do_oembed The false option is used by the function fetch_oembed() * to avoid endless loops * @param type $count Internal counter to avoid endless loops - * + * * @return array which contains needed data for embedding * string 'url' => The url of the parsed page * string 'type' => Content type @@ -95,13 +95,13 @@ class ParseUrl { * if $no_geuessing = false * array'images' = Array of preview pictures * string 'keywords' => The tags which belong to the content - * + * * @todo https://developers.google.com/+/plugins/snippet/ * @verbatim * * * - * + * * *

Shiny Trinket

* @@ -476,7 +476,7 @@ class ParseUrl { /** * @brief Convert tags from CSV to an array - * + * * @param string $string Tags * @return array with formatted Hashtags */ @@ -492,9 +492,9 @@ class ParseUrl { /** * @brief Add a hasht sign to a string - * + * * This method is used as callback function - * + * * @param string $tag The pure tag name * @param int $k Counter for internal use */ @@ -504,16 +504,16 @@ class ParseUrl { /** * @brief Add a scheme to an url - * + * * The src attribute of some html elements (e.g. images) * can miss the scheme so we need to add the correct * scheme - * + * * @param string $url The url which possibly does have * a missing scheme (a link to an image) * @param string $scheme The url with a correct scheme * (e.g. the url from the webpage which does contain the image) - * + * * @return string The url with a scheme */ private static function completeUrl($url, $scheme) { diff --git a/include/network.php b/include/network.php index b7839de21..7385c94a0 100644 --- a/include/network.php +++ b/include/network.php @@ -11,11 +11,11 @@ require_once('include/Probe.php'); /** * @brief Curl wrapper - * + * * If binary flag is true, return binary results. * Set the cookiejar argument to a string (e.g. "/tmp/friendica-cookies.txt") * to preserve cookies from one request to the next. 
- * + * * @param string $url URL to fetch * @param boolean $binary default false * TRUE if asked to return binary results (file download) @@ -23,7 +23,7 @@ require_once('include/Probe.php'); * @param integer $timeout Timeout in seconds, default system config value or 60 seconds * @param string $accept_content supply Accept: header with 'accept_content' as the value * @param string $cookiejar Path to cookie jar file - * + * * @return string The fetched content */ function fetch_url($url,$binary = false, &$redirects = 0, $timeout = 0, $accept_content=Null, $cookiejar = 0) { @@ -218,13 +218,13 @@ function z_fetch_url($url,$binary = false, &$redirects = 0, $opts=array()) { /** * @brief Post request to $url - * + * * @param string $url URL to post * @param mixed $params * @param string $headers HTTP headers * @param integer $redirects Recursion counter for internal use - default = 0 * @param integer $timeout The timeout in seconds, default system config value or 60 seconds - * + * * @return string The content */ function post_url($url,$params, $headers = null, &$redirects = 0, $timeout = 0) { @@ -385,10 +385,10 @@ function http_status_exit($val, $description = array()) { /** * @brief Check URL to se if ts's real - * + * * Take a URL from the wild, prepend http:// if necessary * and check DNS to see if it's real (or check if is a valid IP address) - * + * * @param string $url The URL to be validated * @return boolean True if it's a valid URL, fals if something wrong with it */ @@ -415,7 +415,7 @@ function validate_url(&$url) { /** * @brief Checks that email is an actual resolvable internet address - * + * * @param string $addr The email address * @return boolean True if it's a valid email address, false if it's not */ @@ -436,10 +436,10 @@ function validate_email($addr) { /** * @brief Check if URL is allowed - * + * * Check $url against our list of allowed sites, * wildcards allowed. 
If allowed_sites is unset return true; - * + * * @param string $url URL which get tested * @return boolean True if url is allowed otherwise return false */ @@ -481,9 +481,9 @@ function allowed_url($url) { /** * @brief Check if email address is allowed to register here. - * + * * Compare against our list (wildcards allowed). - * + * * @param type $email * @return boolean False if not allowed, true if allowed * or if allowed list is not configured @@ -821,7 +821,7 @@ function short_link($url) { /** * @brief Encodes content to json - * + * * This function encodes an array to json format * and adds an application/json HTTP header to the output. * After finishing the process is getting killed. From 432587464ce16dff513ed2de340fa3437dbe45aa Mon Sep 17 00:00:00 2001 From: Hypolite Petovan Date: Fri, 17 Feb 2017 22:35:46 -0500 Subject: [PATCH 2/4] Fix Diaspora link attachment probe - Move analytics param stripping out of original_url - Remove HEAD curl request in ParseUrl::getSiteInfo - Replace original_url with strip_tracking_query_params in ParseUrl::getSiteInfo to prevent massive curl fest in border cases --- include/ParseUrl.php | 26 ++--------------------- include/network.php | 49 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 40 insertions(+), 35 deletions(-) diff --git a/include/ParseUrl.php b/include/ParseUrl.php index b85175a25..3a2fe9d53 100644 --- a/include/ParseUrl.php +++ b/include/ParseUrl.php @@ -130,7 +130,7 @@ class ParseUrl { $url = trim($url, "'"); $url = trim($url, '"'); - $url = original_url($url); + $url = strip_tracking_query_params($url); $siteinfo["url"] = $url; $siteinfo["type"] = "link"; @@ -142,8 +142,7 @@ class ParseUrl { $ch = curl_init(); curl_setopt($ch, CURLOPT_URL, $url); curl_setopt($ch, CURLOPT_HEADER, 1); - curl_setopt($ch, CURLOPT_NOBODY, 1); - curl_setopt($ch, CURLOPT_TIMEOUT, 3); + curl_setopt($ch, CURLOPT_TIMEOUT, 10); curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); curl_setopt($ch, CURLOPT_USERAGENT, 
$a->get_useragent()); curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, (($check_cert) ? true : false)); @@ -151,7 +150,6 @@ class ParseUrl { $header = curl_exec($ch); $curl_info = @curl_getinfo($ch); - $http_code = $curl_info["http_code"]; curl_close($ch); $a->save_timestamp($stamp1, "network"); @@ -197,26 +195,6 @@ class ParseUrl { } } - $stamp1 = microtime(true); - - // Now fetch the body as well - $ch = curl_init(); - curl_setopt($ch, CURLOPT_URL, $url); - curl_setopt($ch, CURLOPT_HEADER, 1); - curl_setopt($ch, CURLOPT_NOBODY, 0); - curl_setopt($ch, CURLOPT_TIMEOUT, 10); - curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); - curl_setopt($ch, CURLOPT_USERAGENT, $a->get_useragent()); - curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, (($check_cert) ? true : false)); - curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, (($check_cert) ? 2 : false)); - - $header = curl_exec($ch); - $curl_info = @curl_getinfo($ch); - $http_code = $curl_info["http_code"]; - curl_close($ch); - - $a->save_timestamp($stamp1, "network"); - // Fetch the first mentioned charset. 
Can be in body or header $charset = ""; if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches)) { diff --git a/include/network.php b/include/network.php index 7385c94a0..ecbe0e5c6 100644 --- a/include/network.php +++ b/include/network.php @@ -670,42 +670,69 @@ function fix_contact_ssl_policy(&$contact,$new_policy) { } } -function original_url($url, $depth=1, $fetchbody = false) { - - $a = get_app(); - - // Remove Analytics Data from Google and other tracking platforms +/** + * @brief Remove Google Analytics and other tracking platforms params from URL + * + * @param string $url + * @return string + */ +function strip_tracking_query_params($url) +{ $urldata = parse_url($url); if (is_string($urldata["query"])) { $query = $urldata["query"]; parse_str($query, $querydata); - if (is_array($querydata)) - foreach ($querydata AS $param=>$value) + if (is_array($querydata)) { + foreach ($querydata AS $param => $value) { if (in_array($param, array("utm_source", "utm_medium", "utm_term", "utm_content", "utm_campaign", "wt_mc", "pk_campaign", "pk_kwd", "mc_cid", "mc_eid", "fb_action_ids", "fb_action_types", "fb_ref", "awesm", "wtrid", "woo_campaign", "woo_source", "woo_medium", "woo_content", "woo_term"))) { - $pair = $param."=".urlencode($value); + $pair = $param . "=" . urlencode($value); $url = str_replace($pair, "", $url); // Second try: if the url isn't encoded completely - $pair = $param."=".str_replace(" ", "+", $value); + $pair = $param . "=" . str_replace(" ", "+", $value); $url = str_replace($pair, "", $url); // Third try: Maybey the url isn't encoded at all - $pair = $param."=".$value; + $pair = $param . "=" . 
$value; $url = str_replace($pair, "", $url); $url = str_replace(array("?&", "&&"), array("?", ""), $url); } + } + } - if (substr($url, -1, 1) == "?") + if (substr($url, -1, 1) == "?") { $url = substr($url, 0, -1); + } } + return $url; +} + +/** + * @brief Returns the original URL of the provided URL + * + * This function strips tracking query params and follows redirections, either + * through HTTP code or meta refresh tags. Stops after 10 redirections. + * + * @see ParseUrl::getSiteinfo + * + * @param string $url + * @param int $depth + * @param bool $fetchbody + * @return string + */ +function original_url($url, $depth = 1, $fetchbody = false) { + $a = get_app(); + + $url = strip_tracking_query_params($url); + if ($depth > 10) return($url); From 2c959b925d20898579d4562d8d42669682de0957 Mon Sep 17 00:00:00 2001 From: Hypolite Petovan Date: Sat, 18 Feb 2017 20:35:31 -0500 Subject: [PATCH 3/4] Add param documentation --- include/network.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/network.php b/include/network.php index ecbe0e5c6..727d5e57e 100644 --- a/include/network.php +++ b/include/network.php @@ -673,8 +673,8 @@ function fix_contact_ssl_policy(&$contact,$new_policy) { /** * @brief Remove Google Analytics and other tracking platforms params from URL * - * @param string $url - * @return string + * @param string $url Any user-submitted URL that may contain tracking params + * @return string The same URL stripped of tracking parameters */ function strip_tracking_query_params($url) { From 58a444b4305bd8a1c2ab7ee172c3972e091cc964 Mon Sep 17 00:00:00 2001 From: Hypolite Petovan Date: Sat, 18 Feb 2017 20:39:16 -0500 Subject: [PATCH 4/4] Add original_url() param documentation --- include/network.php | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/include/network.php b/include/network.php index 727d5e57e..03f65a519 100644 --- a/include/network.php +++ b/include/network.php @@ -721,12 +721,14 @@ function 
strip_tracking_query_params($url) * This function strips tracking query params and follows redirections, either * through HTTP code or meta refresh tags. Stops after 10 redirections. * + * @todo Remove the $fetchbody parameter that generates an extraneous HEAD request + * * @see ParseUrl::getSiteinfo * - * @param string $url - * @param int $depth - * @param bool $fetchbody - * @return string + * @param string $url A user-submitted URL + * @param int $depth The current redirection recursion level (internal) + * @param bool $fetchbody Whether to fetch the body or not after the HEAD requests + * @return string A canonical URL */ function original_url($url, $depth = 1, $fetchbody = false) { $a = get_app();