From fd597472f15926336ff99b3661068c8e77be8878 Mon Sep 17 00:00:00 2001 From: Adam Magness Date: Thu, 8 Nov 2018 08:33:28 -0500 Subject: [PATCH] Split text.php to Strings class move functions to new strings class Split text.php to Strings class move functions to new Strings class --- include/text.php | 356 ------------------------------------- src/Util/Strings.php | 405 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 405 insertions(+), 356 deletions(-) create mode 100644 src/Util/Strings.php diff --git a/include/text.php b/include/text.php index 876cdbdb8..08858fe0f 100644 --- a/include/text.php +++ b/include/text.php @@ -31,139 +31,6 @@ use Friendica\Content\Text\HTML; require_once "include/conversation.php"; -/** - * @brief Generates a pseudo-random string of hexadecimal characters - * - * @param int $size - * @return string - */ -function random_string($size = 64) -{ - $byte_size = ceil($size / 2); - - $bytes = random_bytes($byte_size); - - $return = substr(bin2hex($bytes), 0, $size); - - return $return; -} - -/** - * This is our primary input filter. - * - * The high bit hack only involved some old IE browser, forget which (IE5/Mac?) - * that had an XSS attack vector due to stripping the high-bit on an 8-bit character - * after cleansing, and angle chars with the high bit set could get through as markup. - * - * This is now disabled because it was interfering with some legitimate unicode sequences - * and hopefully there aren't a lot of those browsers left. - * - * Use this on any text input where angle chars are not valid or permitted - * They will be replaced with safer brackets. This may be filtered further - * if these are not allowed either. - * - * @param string $string Input string - * @return string Filtered string - */ -function notags($string) { - return str_replace(["<", ">"], ['[', ']'], $string); - -// High-bit filter no longer used -// return str_replace(array("<",">","\xBA","\xBC","\xBE"), array('[',']','','',''), $string); -} - - -/** - * use this on "body" or "content" input where angle chars shouldn't be removed, - * and allow them to be safely displayed. - * @param string $string - * @return string - */ -function escape_tags($string) { - return htmlspecialchars($string, ENT_COMPAT, 'UTF-8', false); -} - - -/** - * generate a string that's random, but usually pronounceable. - * used to generate initial passwords - * @param int $len - * @return string - */ -function autoname($len) { - - if ($len <= 0) { - return ''; - } - - $vowels = ['a','a','ai','au','e','e','e','ee','ea','i','ie','o','ou','u']; - if (mt_rand(0, 5) == 4) { - $vowels[] = 'y'; - } - - $cons = [ - 'b','bl','br', - 'c','ch','cl','cr', - 'd','dr', - 'f','fl','fr', - 'g','gh','gl','gr', - 'h', - 'j', - 'k','kh','kl','kr', - 'l', - 'm', - 'n', - 'p','ph','pl','pr', - 'qu', - 'r','rh', - 's','sc','sh','sm','sp','st', - 't','th','tr', - 'v', - 'w','wh', - 'x', - 'z','zh' - ]; - - $midcons = ['ck','ct','gn','ld','lf','lm','lt','mb','mm', 'mn','mp', - 'nd','ng','nk','nt','rn','rp','rt']; - - $noend = ['bl', 'br', 'cl','cr','dr','fl','fr','gl','gr', - 'kh', 'kl','kr','mn','pl','pr','rh','tr','qu','wh','q']; - - $start = mt_rand(0,2); - if ($start == 0) { - $table = $vowels; - } else { - $table = $cons; - } - - $word = ''; - - for ($x = 0; $x < $len; $x ++) { - $r = mt_rand(0,count($table) - 1); - $word .= $table[$r]; - - if ($table == $vowels) { - $table = array_merge($cons,$midcons); - } else { - $table = $vowels; - } - - } - - $word = substr($word,0,$len); - - foreach ($noend as $noe) { - $noelen = strlen($noe); - if ((strlen($word) > $noelen) && (substr($word, -$noelen) == $noe)) { - $word = autoname($len); - break; - } - } - - return $word; -} - /** * Turn user/group ACLs stored as angle bracketed text into arrays * @@ -255,78 +122,6 @@ function activity_match($haystack,$needle) { return (($haystack === $needle) || ((basename($needle) === $haystack) && strstr($needle, NAMESPACE_ACTIVITY_SCHEMA))); } - -/** - * @brief Pull out all #hashtags and @person tags from $string. - * - * We also get @person@domain.com - which would make - * the regex quite complicated as tags can also - * end a sentence. So we'll run through our results - * and strip the period from any tags which end with one. - * Returns array of tags found, or empty array. - * - * @param string $string Post content - * @return array List of tag and person names - */ -function get_tags($string) { - $ret = []; - - // Convert hashtag links to hashtags - $string = preg_replace('/#\[url\=([^\[\]]*)\](.*?)\[\/url\]/ism', '#$2', $string); - - // ignore anything in a code block - $string = preg_replace('/\[code\](.*?)\[\/code\]/sm', '', $string); - - // Force line feeds at bbtags - $string = str_replace(['[', ']'], ["\n[", "]\n"], $string); - - // ignore anything in a bbtag - $string = preg_replace('/\[(.*?)\]/sm', '', $string); - - // Match full names against @tags including the space between first and last - // We will look these up afterward to see if they are full names or not recognisable. - - if (preg_match_all('/(@[^ \x0D\x0A,:?]+ [^ \x0D\x0A@,:?]+)([ \x0D\x0A@,:?]|$)/', $string, $matches)) { - foreach ($matches[1] as $match) { - if (strstr($match, ']')) { - // we might be inside a bbcode color tag - leave it alone - continue; - } - if (substr($match, -1, 1) === '.') { - $ret[] = substr($match, 0, -1); - } else { - $ret[] = $match; - } - } - } - - // Otherwise pull out single word tags. These can be @nickname, @first_last - // and #hash tags. - - if (preg_match_all('/([!#@][^\^ \x0D\x0A,;:?]+)([ \x0D\x0A,;:?]|$)/', $string, $matches)) { - foreach ($matches[1] as $match) { - if (strstr($match, ']')) { - // we might be inside a bbcode color tag - leave it alone - continue; - } - if (substr($match, -1, 1) === '.') { - $match = substr($match,0,-1); - } - // ignore strictly numeric tags like #1 - if ((strpos($match, '#') === 0) && ctype_digit(substr($match, 1))) { - continue; - } - // try not to catch url fragments - if (strpos($string, $match) && preg_match('/[a-zA-z0-9\/]/', substr($string, strpos($string, $match) - 1, 1))) { - continue; - } - $ret[] = $match; - } - } - return $ret; -} - - /** * quick and dirty quoted_printable encoding * @@ -337,45 +132,6 @@ function qp($s) { return str_replace("%", "=", rawurlencode($s)); } -/** - * @brief Check for a valid email string - * - * @param string $email_address - * @return boolean - */ -function valid_email($email_address) -{ - return preg_match('/^[_a-zA-Z0-9\-\+]+(\.[_a-zA-Z0-9\-\+]+)*@[a-zA-Z0-9-]+(\.[a-zA-Z0-9-]+)+$/', $email_address); -} - -/** - * Normalize url - * - * @param string $url - * @return string - */ -function normalise_link($url) { - $ret = str_replace(['https:', '//www.'], ['http:', '//'], $url); - return rtrim($ret,'/'); -} - - -/** - * Compare two URLs to see if they are the same, but ignore - * slight but hopefully insignificant differences such as if one - * is https and the other isn't, or if one is www.something and - * the other isn't - and also ignore case differences. - * - * @param string $a first url - * @param string $b second url - * @return boolean True if the URLs match, otherwise False - * - */ -function link_compare($a, $b) { - return (strcasecmp(normalise_link($a), normalise_link($b)) === 0); -} - - /** * @brief Find any non-embedded images in private items and add redir links to them * @@ -507,53 +263,6 @@ function return_bytes($size_str) { } } -/** - * @param string $s - * @param boolean $strip_padding - * @return string - */ -function base64url_encode($s, $strip_padding = false) { - - $s = strtr(base64_encode($s), '+/', '-_'); - - if ($strip_padding) { - $s = str_replace('=','',$s); - } - - return $s; -} - -/** - * @param string $s - * @return string - */ -function base64url_decode($s) { - - if (is_array($s)) { - Logger::log('base64url_decode: illegal input: ' . print_r(debug_backtrace(), true)); - return $s; - } - -/* - * // Placeholder for new rev of salmon which strips base64 padding. - * // PHP base64_decode handles the un-padded input without requiring this step - * // Uncomment if you find you need it. - * - * $l = strlen($s); - * if (!strpos($s,'=')) { - * $m = $l % 4; - * if ($m == 2) - * $s .= '=='; - * if ($m == 3) - * $s .= '='; - * } - * - */ - - return base64_decode(strtr($s,'-_','+/')); -} - - function bb_translate_video($s) { $matches = null; @@ -570,11 +279,6 @@ function bb_translate_video($s) { return $s; } -function normalise_openid($s) { - return trim(str_replace(['http://', 'https://'], ['', ''], $s), '/'); -} - - function undo_post_tagging($s) { $matches = null; $cnt = preg_match_all('/([!#@])\[url=(.*?)\](.*?)\[\/url\]/ism', $s, $matches, PREG_SET_ORDER); @@ -590,10 +294,6 @@ function undo_post_tagging($s) { return $s; } -function protect_sprintf($s) { - return str_replace('%', '%%', $s); -} - /// @TODO Rewrite this function is_a_date_arg($s) { $i = intval($s); @@ -612,59 +312,3 @@ function is_a_date_arg($s) { return false; } - -/** - * remove intentation from a text - */ -function deindent($text, $chr = "[\t ]", $count = NULL) { - $lines = explode("\n", $text); - - if (is_null($count)) { - $m = []; - $k = 0; - while ($k < count($lines) && strlen($lines[$k]) == 0) { - $k++; - } - preg_match("|^" . $chr . "*|", $lines[$k], $m); - $count = strlen($m[0]); - } - - for ($k = 0; $k < count($lines); $k++) { - $lines[$k] = preg_replace("|^" . $chr . "{" . $count . "}|", "", $lines[$k]); - } - - return implode("\n", $lines); -} - -function formatBytes($bytes, $precision = 2) { - $units = ['B', 'KB', 'MB', 'GB', 'TB']; - - $bytes = max($bytes, 0); - $pow = floor(($bytes ? log($bytes) : 0) / log(1024)); - $pow = min($pow, count($units) - 1); - - $bytes /= pow(1024, $pow); - - return round($bytes, $precision) . ' ' . $units[$pow]; -} - -/** - * @brief translate and format the networkname of a contact - * - * @param string $network - * Networkname of the contact (e.g. dfrn, rss and so on) - * @param sting $url - * The contact url - * @return string - */ -function format_network_name($network, $url = 0) { - if ($network != "") { - if ($url != "") { - $network_name = ''.ContactSelector::networkToName($network, $url).""; - } else { - $network_name = ContactSelector::networkToName($network); - } - - return $network_name; - } -} diff --git a/src/Util/Strings.php b/src/Util/Strings.php new file mode 100644 index 000000000..8df9f585d --- /dev/null +++ b/src/Util/Strings.php @@ -0,0 +1,405 @@ +"], ['[', ']'], $string); + } + + /** + * @brief Use this on "body" or "content" input where angle chars shouldn't be removed, + * and allow them to be safely displayed. + * @param string $string + * + * @return string + */ + public static function escapeTags($string) // escape_tags() + { + return htmlspecialchars($string, ENT_COMPAT, 'UTF-8', false); + } + + /** + * @brief Generate a string that's random, but usually pronounceable. Used to generate initial passwords + * + * @param int $len length + * + * @return string + */ + public static function getRandomName($len) // autoname() + { + if ($len <= 0) { + return ''; + } + + $vowels = ['a', 'a', 'ai', 'au', 'e', 'e', 'e', 'ee', 'ea', 'i', 'ie', 'o', 'ou', 'u']; + + if (mt_rand(0, 5) == 4) { + $vowels[] = 'y'; + } + + $cons = [ + 'b', 'bl', 'br', + 'c', 'ch', 'cl', 'cr', + 'd', 'dr', + 'f', 'fl', 'fr', + 'g', 'gh', 'gl', 'gr', + 'h', + 'j', + 'k', 'kh', 'kl', 'kr', + 'l', + 'm', + 'n', + 'p', 'ph', 'pl', 'pr', + 'qu', + 'r', 'rh', + 's' ,'sc', 'sh', 'sm', 'sp', 'st', + 't', 'th', 'tr', + 'v', + 'w', 'wh', + 'x', + 'z', 'zh' + ]; + + $midcons = ['ck', 'ct', 'gn', 'ld', 'lf', 'lm', 'lt', 'mb', 'mm', 'mn', 'mp', + 'nd', 'ng', 'nk', 'nt', 'rn', 'rp', 'rt']; + + $noend = ['bl', 'br', 'cl', 'cr', 'dr', 'fl', 'fr', 'gl', 'gr', + 'kh', 'kl', 'kr', 'mn', 'pl', 'pr', 'rh', 'tr', 'qu', 'wh', 'q']; + + $start = mt_rand(0, 2); + if ($start == 0) { + $table = $vowels; + } else { + $table = $cons; + } + + $word = ''; + + for ($x = 0; $x < $len; $x ++) { + $r = mt_rand(0, count($table) - 1); + $word .= $table[$r]; + + if ($table == $vowels) { + $table = array_merge($cons, $midcons); + } else { + $table = $vowels; + } + + } + + $word = substr($word, 0, $len); + + foreach ($noend as $noe) { + $noelen = strlen($noe); + if ((strlen($word) > $noelen) && (substr($word, -$noelen) == $noe)) { + $word = self::getRandomName($len); + break; + } + } + + return $word; + } + + /** + * @brief translate and format the networkname of a contact + * + * @param string $network Networkname of the contact (e.g. dfrn, rss and so on) + * @param string $url The contact url + * + * @return string Formatted network name + */ + public static function formatNetworkName($network, $url = 0) // format_network_name() + { + if ($network != "") { + if ($url != "") { + $network_name = ''.ContactSelector::networkToName($network, $url).""; + } else { + $network_name = ContactSelector::networkToName($network); + } + + return $network_name; + } + } + + /** + * @brief Remove intentation from a text + * + * @param string $text String to be transformed. + * @param string $chr Optional. Indentation tag. Default tab (\t). + * @param int $count Optional. Default null. + * + * @return string Transformed string. + */ + public static function deindent($text, $chr = "[\t ]", $count = NULL) + { + $lines = explode("\n", $text); + + if (is_null($count)) { + $m = []; + $k = 0; + while ($k < count($lines) && strlen($lines[$k]) == 0) { + $k++; + } + preg_match("|^" . $chr . "*|", $lines[$k], $m); + $count = strlen($m[0]); + } + + for ($k = 0; $k < count($lines); $k++) { + $lines[$k] = preg_replace("|^" . $chr . "{" . $count . "}|", "", $lines[$k]); + } + + return implode("\n", $lines); + } + + /** + * @brief Get byte size returned in a Data Measurement (KB, MB, GB) + * + * @param int $bytes The number of bytes to be measured + * @param int $precision Optional. Default 2. + * + * @return string Size with measured units. + */ + public static function formatBytes($bytes, $precision = 2) + { + $units = ['B', 'KB', 'MB', 'GB', 'TB']; + $bytes = max($bytes, 0); + $pow = floor(($bytes ? log($bytes) : 0) / log(1024)); + $pow = min($pow, count($units) - 1); + $bytes /= pow(1024, $pow); + + return round($bytes, $precision) . ' ' . $units[$pow]; + } + + /** + * @brief Protect percent characters in sprintf calls + * + * @param string $s String to transform. + * + * @return string Transformed string. + */ + public static function protectSprintf($s) // protect_sprintf() + { + return str_replace('%', '%%', $s); + } + + /** + * @brief Base64 Encode URL and translate +/ to -_ Optionally strip padding. + * + * @param string $s URL to encode + * @param boolean $strip_padding Optional. Default false + * + * @return string Encoded URL + */ + public static function base64UrlEncode($s, $strip_padding = false) //base64url_encode() + { + $s = strtr(base64_encode($s), '+/', '-_'); + + if ($strip_padding) { + $s = str_replace('=', '', $s); + } + + return $s; + } + + /** + * @brief Decode Base64 Encoded URL and translate -_ to +/ + * @param string $s URL to decode + * + * @return string Decoded URL + */ + public static function base64url_decode($s) // base64url_decode() + { + if (is_array($s)) { + Logger::log('base64url_decode: illegal input: ' . print_r(debug_backtrace(), true)); + return $s; + } + + /* + * // Placeholder for new rev of salmon which strips base64 padding. + * // PHP base64_decode handles the un-padded input without requiring this step + * // Uncomment if you find you need it. + * + * $l = strlen($s); + * if (!strpos($s,'=')) { + * $m = $l % 4; + * if ($m == 2) + * $s .= '=='; + * if ($m == 3) + * $s .= '='; + * } + * + */ + + return base64_decode(strtr($s, '-_', '+/')); + } + + /** + * @brief Pull out all #hashtags and @person tags from $string. + * + * We also get @person@domain.com - which would make + * the regex quite complicated as tags can also + * end a sentence. So we'll run through our results + * and strip the period from any tags which end with one. + * Returns array of tags found, or empty array. + * + * @param string $string Post content + * + * @return array List of tag and person names + */ + public static function getTags($string) // get_tags() + { + $ret = []; + + // Convert hashtag links to hashtags + $string = preg_replace('/#\[url\=([^\[\]]*)\](.*?)\[\/url\]/ism', '#$2', $string); + + // ignore anything in a code block + $string = preg_replace('/\[code\](.*?)\[\/code\]/sm', '', $string); + + // Force line feeds at bbtags + $string = str_replace(['[', ']'], ["\n[", "]\n"], $string); + + // ignore anything in a bbtag + $string = preg_replace('/\[(.*?)\]/sm', '', $string); + + // Match full names against @tags including the space between first and last + // We will look these up afterward to see if they are full names or not recognisable. + + if (preg_match_all('/(@[^ \x0D\x0A,:?]+ [^ \x0D\x0A@,:?]+)([ \x0D\x0A@,:?]|$)/', $string, $matches)) { + foreach ($matches[1] as $match) { + if (strstr($match, ']')) { + // we might be inside a bbcode color tag - leave it alone + continue; + } + + if (substr($match, -1, 1) === '.') { + $ret[] = substr($match, 0, -1); + } else { + $ret[] = $match; + } + } + } + + // Otherwise pull out single word tags. These can be @nickname, @first_last + // and #hash tags. + + if (preg_match_all('/([!#@][^\^ \x0D\x0A,;:?]+)([ \x0D\x0A,;:?]|$)/', $string, $matches)) { + foreach ($matches[1] as $match) { + if (strstr($match, ']')) { + // we might be inside a bbcode color tag - leave it alone + continue; + } + if (substr($match, -1, 1) === '.') { + $match = substr($match,0,-1); + } + // ignore strictly numeric tags like #1 + if ((strpos($match, '#') === 0) && ctype_digit(substr($match, 1))) { + continue; + } + // try not to catch url fragments + if (strpos($string, $match) && preg_match('/[a-zA-z0-9\/]/', substr($string, strpos($string, $match) - 1, 1))) { + continue; + } + $ret[] = $match; + } + } + + return $ret; + } + + /** + * @brief Check for a valid email string + * + * @param string $email_address Email address to be evaluated. + * + * @return boolean Value indicating whether or not the string is a valid email address. + */ + public static function isValidEmail($email_address) // valid_email() + { + return preg_match('/^[_a-zA-Z0-9\-\+]+(\.[_a-zA-Z0-9\-\+]+)*@[a-zA-Z0-9-]+(\.[a-zA-Z0-9-]+)+$/', $email_address); + } + + /** + * @brief Normalize url + * + * @param string $url URL to be normalized. + * + * @return string Normalized URL. + */ + public static function normaliseLink($url) // normalise_link() + { + $ret = str_replace(['https:', '//www.'], ['http:', '//'], $url); + return rtrim($ret, '/'); + } + + /** + * @brief Normalize OpenID identity + * + * @param string $s OpenID Identity + * + * @return string normalized OpenId Identity + */ + function normaliseOpenID($s) // normalize_openid() + { + return trim(str_replace(['http://', 'https://'], ['', ''], $s), '/'); + } + + /** + * @brief Compare two URLs to see if they are the same, but ignore + * slight but hopefully insignificant differences such as if one + * is https and the other isn't, or if one is www.something and + * the other isn't - and also ignore case differences. + * + * @param string $a first url + * @param string $b second url + * @return boolean True if the URLs match, otherwise False + * + */ + public static function compareLink($a, $b) // link_compare() + { + return (strcasecmp(normalise_link($a), normalise_link($b)) === 0); + } +}