Merge pull request #3171 from Hypolite/bug/fix-diaspora-attachment-links
Bug/fix diaspora attachment links
This commit is contained in:
		
				commit
				
					
						82ebc673a0
					
				
			
		
					 2 changed files with 70 additions and 63 deletions
				
			
		| 
						 | 
				
			
			@ -21,13 +21,13 @@ class ParseUrl {
 | 
			
		|||
 | 
			
		||||
	/**
 | 
			
		||||
	 * @brief Search for cached embeddable data of an url otherwise fetch it
 | 
			
		||||
	 * 
 | 
			
		||||
	 *
 | 
			
		||||
	 * @param type $url The url of the page which should be scraped
 | 
			
		||||
	 * @param type $no_guessing If true the parser doesn't search for
 | 
			
		||||
	 *    preview pictures
 | 
			
		||||
	 * @param type $do_oembed The false option is used by the function fetch_oembed()
 | 
			
		||||
	 *    to avoid endless loops
 | 
			
		||||
	 * 
 | 
			
		||||
	 *
 | 
			
		||||
	 * @return array which contains needed data for embedding
 | 
			
		||||
	 *    string 'url' => The url of the parsed page
 | 
			
		||||
	 *    string 'type' => Content type
 | 
			
		||||
| 
						 | 
				
			
			@ -37,9 +37,9 @@ class ParseUrl {
 | 
			
		|||
	 *                if $no_guessing = false
 | 
			
		||||
	 *    array 'images' => Array of preview pictures
 | 
			
		||||
	 *    string 'keywords' => The tags which belong to the content
 | 
			
		||||
	 * 
 | 
			
		||||
	 *
 | 
			
		||||
	 * @see ParseUrl::getSiteinfo() for more information about scraping
 | 
			
		||||
	 * embeddable content 
 | 
			
		||||
	 * embeddable content
 | 
			
		||||
	 */
 | 
			
		||||
	public static function getSiteinfoCached($url, $no_guessing = false, $do_oembed = true) {
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -71,21 +71,21 @@ class ParseUrl {
 | 
			
		|||
	}
 | 
			
		||||
	/**
 | 
			
		||||
	 * @brief Parse a page for embeddable content information
 | 
			
		||||
	 * 
 | 
			
		||||
	 *
 | 
			
		||||
	 * This method parses the url for meta data which can be used to embed
 | 
			
		||||
	 * the content. If available it prioritizes Open Graph meta tags.
 | 
			
		||||
	 * If this is not available it uses the twitter cards meta tags.
 | 
			
		||||
	 * As fallback it uses standard html elements with meta information
 | 
			
		||||
	 * like \<title\>Awesome Title\</title\> or
 | 
			
		||||
	 * \<meta name="description" content="An awesome description"\>
 | 
			
		||||
	 * 
 | 
			
		||||
	 *
 | 
			
		||||
	 * @param type $url The url of the page which should be scraped
 | 
			
		||||
	 * @param type $no_guessing If true the parser doesn't search for
 | 
			
		||||
	 *    preview pictures
 | 
			
		||||
	 * @param type $do_oembed The false option is used by the function fetch_oembed()
 | 
			
		||||
	 *    to avoid endless loops
 | 
			
		||||
	 * @param type $count Internal counter to avoid endless loops
 | 
			
		||||
	 * 
 | 
			
		||||
	 *
 | 
			
		||||
	 * @return array which contains needed data for embedding
 | 
			
		||||
	 *    string 'url' => The url of the parsed page
 | 
			
		||||
	 *    string 'type' => Content type
 | 
			
		||||
| 
						 | 
				
			
			@ -95,13 +95,13 @@ class ParseUrl {
 | 
			
		|||
	 *                if $no_guessing = false
 | 
			
		||||
	 *    array 'images' => Array of preview pictures
 | 
			
		||||
	 *    string 'keywords' => The tags which belong to the content
 | 
			
		||||
	 * 
 | 
			
		||||
	 *
 | 
			
		||||
	 * @todo https://developers.google.com/+/plugins/snippet/
 | 
			
		||||
	 * @verbatim
 | 
			
		||||
	 * <meta itemprop="name" content="Awesome title">
 | 
			
		||||
	 * <meta itemprop="description" content="An awesome description">
 | 
			
		||||
	 * <meta itemprop="image" content="http://maple.libertreeproject.org/images/tree-icon.png">
 | 
			
		||||
	 * 
 | 
			
		||||
	 *
 | 
			
		||||
	 * <body itemscope itemtype="http://schema.org/Product">
 | 
			
		||||
	 *   <h1 itemprop="name">Shiny Trinket</h1>
 | 
			
		||||
	 *   <img itemprop="image" src="{image-url}" />
 | 
			
		||||
| 
						 | 
				
			
			@ -130,7 +130,7 @@ class ParseUrl {
 | 
			
		|||
		$url = trim($url, "'");
 | 
			
		||||
		$url = trim($url, '"');
 | 
			
		||||
 | 
			
		||||
		$url = original_url($url);
 | 
			
		||||
		$url = strip_tracking_query_params($url);
 | 
			
		||||
 | 
			
		||||
		$siteinfo["url"] = $url;
 | 
			
		||||
		$siteinfo["type"] = "link";
 | 
			
		||||
| 
						 | 
				
			
			@ -142,8 +142,7 @@ class ParseUrl {
 | 
			
		|||
		$ch = curl_init();
 | 
			
		||||
		curl_setopt($ch, CURLOPT_URL, $url);
 | 
			
		||||
		curl_setopt($ch, CURLOPT_HEADER, 1);
 | 
			
		||||
		curl_setopt($ch, CURLOPT_NOBODY, 1);
 | 
			
		||||
		curl_setopt($ch, CURLOPT_TIMEOUT, 3);
 | 
			
		||||
		curl_setopt($ch, CURLOPT_TIMEOUT, 10);
 | 
			
		||||
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
 | 
			
		||||
		curl_setopt($ch, CURLOPT_USERAGENT, $a->get_useragent());
 | 
			
		||||
		curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, (($check_cert) ? true : false));
 | 
			
		||||
| 
						 | 
				
			
			@ -151,7 +150,6 @@ class ParseUrl {
 | 
			
		|||
 | 
			
		||||
		$header = curl_exec($ch);
 | 
			
		||||
		$curl_info = @curl_getinfo($ch);
 | 
			
		||||
		$http_code = $curl_info["http_code"];
 | 
			
		||||
		curl_close($ch);
 | 
			
		||||
 | 
			
		||||
		$a->save_timestamp($stamp1, "network");
 | 
			
		||||
| 
						 | 
				
			
			@ -197,26 +195,6 @@ class ParseUrl {
 | 
			
		|||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		$stamp1 = microtime(true);
 | 
			
		||||
 | 
			
		||||
		// Now fetch the body as well
 | 
			
		||||
		$ch = curl_init();
 | 
			
		||||
		curl_setopt($ch, CURLOPT_URL, $url);
 | 
			
		||||
		curl_setopt($ch, CURLOPT_HEADER, 1);
 | 
			
		||||
		curl_setopt($ch, CURLOPT_NOBODY, 0);
 | 
			
		||||
		curl_setopt($ch, CURLOPT_TIMEOUT, 10);
 | 
			
		||||
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
 | 
			
		||||
		curl_setopt($ch, CURLOPT_USERAGENT, $a->get_useragent());
 | 
			
		||||
		curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, (($check_cert) ? true : false));
 | 
			
		||||
		curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, (($check_cert) ? 2 : false));
 | 
			
		||||
 | 
			
		||||
		$header = curl_exec($ch);
 | 
			
		||||
		$curl_info = @curl_getinfo($ch);
 | 
			
		||||
		$http_code = $curl_info["http_code"];
 | 
			
		||||
		curl_close($ch);
 | 
			
		||||
 | 
			
		||||
		$a->save_timestamp($stamp1, "network");
 | 
			
		||||
 | 
			
		||||
		// Fetch the first mentioned charset. Can be in body or header
 | 
			
		||||
		$charset = "";
 | 
			
		||||
		if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches)) {
 | 
			
		||||
| 
						 | 
				
			
			@ -476,7 +454,7 @@ class ParseUrl {
 | 
			
		|||
 | 
			
		||||
	/**
 | 
			
		||||
	 * @brief Convert tags from CSV to an array
 | 
			
		||||
	 * 
 | 
			
		||||
	 *
 | 
			
		||||
	 * @param string $string Tags
 | 
			
		||||
	 * @return array with formatted Hashtags
 | 
			
		||||
	 */
 | 
			
		||||
| 
						 | 
				
			
			@ -492,9 +470,9 @@ class ParseUrl {
 | 
			
		|||
 | 
			
		||||
	/**
 | 
			
		||||
	 * @brief Add a hash sign to a string
 | 
			
		||||
	 * 
 | 
			
		||||
	 *
 | 
			
		||||
	 *  This method is used as callback function
 | 
			
		||||
	 * 
 | 
			
		||||
	 *
 | 
			
		||||
	 * @param string $tag The pure tag name
 | 
			
		||||
	 * @param int $k Counter for internal use
 | 
			
		||||
	 */
 | 
			
		||||
| 
						 | 
				
			
			@ -504,16 +482,16 @@ class ParseUrl {
 | 
			
		|||
 | 
			
		||||
	/**
 | 
			
		||||
	 * @brief Add a scheme to an url
 | 
			
		||||
	 * 
 | 
			
		||||
	 *
 | 
			
		||||
	 * The src attribute of some html elements (e.g. images)
 | 
			
		||||
	 * can miss the scheme so we need to add the correct
 | 
			
		||||
	 * scheme
 | 
			
		||||
	 * 
 | 
			
		||||
	 *
 | 
			
		||||
	 * @param string $url The url which possibly does have
 | 
			
		||||
	 *    a missing scheme (a link to an image)
 | 
			
		||||
	 * @param string $scheme The url with a correct scheme
 | 
			
		||||
	 *    (e.g. the url from the webpage which does contain the image)
 | 
			
		||||
	 * 
 | 
			
		||||
	 *
 | 
			
		||||
	 * @return string The url with a scheme
 | 
			
		||||
	 */
 | 
			
		||||
	private static function completeUrl($url, $scheme) {
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -11,11 +11,11 @@ require_once('include/Probe.php');
 | 
			
		|||
 | 
			
		||||
/**
 | 
			
		||||
 * @brief Curl wrapper
 | 
			
		||||
 * 
 | 
			
		||||
 *
 | 
			
		||||
 * If binary flag is true, return binary results.
 | 
			
		||||
 * Set the cookiejar argument to a string (e.g. "/tmp/friendica-cookies.txt")
 | 
			
		||||
 * to preserve cookies from one request to the next.
 | 
			
		||||
 * 
 | 
			
		||||
 *
 | 
			
		||||
 * @param string $url URL to fetch
 | 
			
		||||
 * @param boolean $binary default false
 | 
			
		||||
 *    TRUE if asked to return binary results (file download)
 | 
			
		||||
| 
						 | 
				
			
			@ -23,7 +23,7 @@ require_once('include/Probe.php');
 | 
			
		|||
 * @param integer $timeout Timeout in seconds, default system config value or 60 seconds
 | 
			
		||||
 * @param string $accept_content supply Accept: header with 'accept_content' as the value
 | 
			
		||||
 * @param string $cookiejar Path to cookie jar file
 | 
			
		||||
 * 
 | 
			
		||||
 *
 | 
			
		||||
 * @return string The fetched content
 | 
			
		||||
 */
 | 
			
		||||
function fetch_url($url,$binary = false, &$redirects = 0, $timeout = 0, $accept_content=Null, $cookiejar = 0) {
 | 
			
		||||
| 
						 | 
				
			
			@ -218,13 +218,13 @@ function z_fetch_url($url,$binary = false, &$redirects = 0, $opts=array()) {
 | 
			
		|||
 | 
			
		||||
/**
 | 
			
		||||
 * @brief Post request to $url
 | 
			
		||||
 * 
 | 
			
		||||
 *
 | 
			
		||||
 * @param string $url URL to post
 | 
			
		||||
 * @param mixed $params
 | 
			
		||||
 * @param string $headers HTTP headers
 | 
			
		||||
 * @param integer $redirects Recursion counter for internal use - default = 0
 | 
			
		||||
 * @param integer $timeout The timeout in seconds, default system config value or 60 seconds
 | 
			
		||||
 * 
 | 
			
		||||
 *
 | 
			
		||||
 * @return string The content
 | 
			
		||||
 */
 | 
			
		||||
function post_url($url,$params, $headers = null, &$redirects = 0, $timeout = 0) {
 | 
			
		||||
| 
						 | 
				
			
			@ -385,10 +385,10 @@ function http_status_exit($val, $description = array()) {
 | 
			
		|||
 | 
			
		||||
/**
 | 
			
		||||
 * @brief Check URL to see if it's real
 | 
			
		||||
 * 
 | 
			
		||||
 *
 | 
			
		||||
 * Take a URL from the wild, prepend http:// if necessary
 | 
			
		||||
 * and check DNS to see if it's real (or check if is a valid IP address)
 | 
			
		||||
 * 
 | 
			
		||||
 *
 | 
			
		||||
 * @param string $url The URL to be validated
 | 
			
		||||
 * @return boolean True if it's a valid URL, false if something is wrong with it
 | 
			
		||||
 */
 | 
			
		||||
| 
						 | 
				
			
			@ -415,7 +415,7 @@ function validate_url(&$url) {
 | 
			
		|||
 | 
			
		||||
/**
 | 
			
		||||
 * @brief Checks that email is an actual resolvable internet address
 | 
			
		||||
 * 
 | 
			
		||||
 *
 | 
			
		||||
 * @param string $addr The email address
 | 
			
		||||
 * @return boolean True if it's a valid email address, false if it's not
 | 
			
		||||
 */
 | 
			
		||||
| 
						 | 
				
			
			@ -436,10 +436,10 @@ function validate_email($addr) {
 | 
			
		|||
 | 
			
		||||
/**
 | 
			
		||||
 * @brief Check if URL is allowed
 | 
			
		||||
 * 
 | 
			
		||||
 *
 | 
			
		||||
 * Check $url against our list of allowed sites,
 | 
			
		||||
 * wildcards allowed. If allowed_sites is unset return true;
 | 
			
		||||
 * 
 | 
			
		||||
 *
 | 
			
		||||
 * @param string $url URL which get tested
 | 
			
		||||
 * @return boolean True if url is allowed otherwise return false
 | 
			
		||||
 */
 | 
			
		||||
| 
						 | 
				
			
			@ -481,9 +481,9 @@ function allowed_url($url) {
 | 
			
		|||
 | 
			
		||||
/**
 | 
			
		||||
 * @brief Check if email address is allowed to register here.
 | 
			
		||||
 * 
 | 
			
		||||
 *
 | 
			
		||||
 * Compare against our list (wildcards allowed).
 | 
			
		||||
 * 
 | 
			
		||||
 *
 | 
			
		||||
 * @param type $email
 | 
			
		||||
 * @return boolean False if not allowed, true if allowed
 | 
			
		||||
 *    or if allowed list is not configured
 | 
			
		||||
| 
						 | 
				
			
			@ -670,42 +670,71 @@ function fix_contact_ssl_policy(&$contact,$new_policy) {
 | 
			
		|||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
function original_url($url, $depth=1, $fetchbody = false) {
 | 
			
		||||
 | 
			
		||||
	$a = get_app();
 | 
			
		||||
 | 
			
		||||
	// Remove Analytics Data from Google and other tracking platforms
 | 
			
		||||
/**
 | 
			
		||||
 * @brief Remove Google Analytics and other tracking platforms params from URL
 | 
			
		||||
 *
 | 
			
		||||
 * @param string $url Any user-submitted URL that may contain tracking params
 | 
			
		||||
 * @return string The same URL stripped of tracking parameters
 | 
			
		||||
 */
 | 
			
		||||
function strip_tracking_query_params($url)
 | 
			
		||||
{
 | 
			
		||||
	$urldata = parse_url($url);
 | 
			
		||||
	if (is_string($urldata["query"])) {
 | 
			
		||||
		$query = $urldata["query"];
 | 
			
		||||
		parse_str($query, $querydata);
 | 
			
		||||
 | 
			
		||||
		if (is_array($querydata))
 | 
			
		||||
			foreach ($querydata AS $param=>$value)
 | 
			
		||||
		if (is_array($querydata)) {
 | 
			
		||||
			foreach ($querydata AS $param => $value) {
 | 
			
		||||
				if (in_array($param, array("utm_source", "utm_medium", "utm_term", "utm_content", "utm_campaign",
 | 
			
		||||
							"wt_mc", "pk_campaign", "pk_kwd", "mc_cid", "mc_eid",
 | 
			
		||||
							"fb_action_ids", "fb_action_types", "fb_ref",
 | 
			
		||||
							"awesm", "wtrid",
 | 
			
		||||
							"woo_campaign", "woo_source", "woo_medium", "woo_content", "woo_term"))) {
 | 
			
		||||
 | 
			
		||||
					$pair = $param."=".urlencode($value);
 | 
			
		||||
					$pair = $param . "=" . urlencode($value);
 | 
			
		||||
					$url = str_replace($pair, "", $url);
 | 
			
		||||
 | 
			
		||||
					// Second try: if the url isn't encoded completely
 | 
			
		||||
					$pair = $param."=".str_replace(" ", "+", $value);
 | 
			
		||||
					$pair = $param . "=" . str_replace(" ", "+", $value);
 | 
			
		||||
					$url = str_replace($pair, "", $url);
 | 
			
		||||
 | 
			
		||||
					// Third try: Maybe the url isn't encoded at all
 | 
			
		||||
					$pair = $param."=".$value;
 | 
			
		||||
					$pair = $param . "=" . $value;
 | 
			
		||||
					$url = str_replace($pair, "", $url);
 | 
			
		||||
 | 
			
		||||
					$url = str_replace(array("?&", "&&"), array("?", ""), $url);
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		if (substr($url, -1, 1) == "?")
 | 
			
		||||
		if (substr($url, -1, 1) == "?") {
 | 
			
		||||
			$url = substr($url, 0, -1);
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	return $url;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * @brief Returns the original URL of the provided URL
 | 
			
		||||
 *
 | 
			
		||||
 * This function strips tracking query params and follows redirections, either
 | 
			
		||||
 * through HTTP code or meta refresh tags. Stops after 10 redirections.
 | 
			
		||||
 *
 | 
			
		||||
 * @todo Remove the $fetchbody parameter that generates an extraneous HEAD request
 | 
			
		||||
 *
 | 
			
		||||
 * @see ParseUrl::getSiteinfo
 | 
			
		||||
 *
 | 
			
		||||
 * @param string $url A user-submitted URL
 | 
			
		||||
 * @param int $depth The current redirection recursion level (internal)
 | 
			
		||||
 * @param bool $fetchbody Whether to fetch the body or not after the HEAD requests
 | 
			
		||||
 * @return string A canonical URL
 | 
			
		||||
 */
 | 
			
		||||
function original_url($url, $depth = 1, $fetchbody = false) {
 | 
			
		||||
	$a = get_app();
 | 
			
		||||
 | 
			
		||||
	$url = strip_tracking_query_params($url);
 | 
			
		||||
 | 
			
		||||
	if ($depth > 10)
 | 
			
		||||
		return($url);
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -821,7 +850,7 @@ function short_link($url) {
 | 
			
		|||
 | 
			
		||||
/**
 | 
			
		||||
 * @brief Encodes content to json
 | 
			
		||||
 * 
 | 
			
		||||
 *
 | 
			
		||||
 * This function encodes an array to json format
 | 
			
		||||
 * and adds an application/json HTTP header to the output.
 | 
			
		||||
 * After finishing the process is getting killed.
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue