. WE DON'T WANT YOUR MONEY: NO TIPS NECESSARY! If you enjoy this plugin, a link to http://kingdesk.com from your website would be appreciated. For web design services, please contact jeff@kingdesk.com. */

# if used with multibyte language, UTF-8 encoding is required!
class phpTypography {

    var $mb = FALSE; //cannot be changed after load
    var $chr = array();
    var $settings = array(); // operational attributes
    var $parsedHTML = array(); // to hold current instance of class parseHTML
    var $parsedText = array(); // to hold current instance of class parseText

    #=======================================================================
    #=======================================================================
    #== METHODS - SET ATTRIBUTES
    #=======================================================================
    #=======================================================================

    // __ naming defines the constructor that is automatically called on each newly-created object
    //
    // Fills $this->chr with the named typographic characters used by the class.
    // Each value is produced by $this->uchr(), which is defined elsewhere in this class
    // (presumably it turns a Unicode code point into a UTF-8 string -- confirm there).
    //
    // $setDefaults: when truthy, set_defaults() is called so every setting starts at its default.
    function __construct($setDefaults = TRUE) {
        $this->chr["noBreakSpace"] = $this->uchr(160);
        $this->chr["noBreakNarrowSpace"] = $this->uchr(160); //should be 8239, but not supported consistently, used in unit spacing
        $this->chr["copyright"] = $this->uchr(169);
        $this->chr["guillemetOpen"] = $this->uchr(171);
        $this->chr["softHyphen"] = $this->uchr(173);
        $this->chr["registeredMark"] = $this->uchr(174);
        $this->chr["guillemetClose"] = $this->uchr(187);
        $this->chr["multiplication"] = $this->uchr(215);
        $this->chr["division"] = $this->uchr(247);
        $this->chr["figureSpace"] = $this->uchr(8199);
        $this->chr["thinSpace"] = $this->uchr(8201);
        $this->chr["zeroWidthSpace"] = $this->uchr(8203);
        $this->chr["hyphen"] = "-"; // should be $this->uchr(8208), but IE6 chokes;
        $this->chr["noBreakHyphen"] = $this->uchr(8209);
        $this->chr["enDash"] = $this->uchr(8211);
        $this->chr["emDash"] = $this->uchr(8212);
        $this->chr["singleQuoteOpen"] = $this->uchr(8216); // reset in set_smart_quotes_language()
        $this->chr["singleQuoteClose"] = $this->uchr(8217); // reset in set_smart_quotes_language()
        $this->chr["apostrophe"] = $this->uchr(8217); // defined separately from singleQuoteClose so quotes can be redefined in set_smart_quotes_language() without disrupting apostrophes
        $this->chr["singleLow9Quote"] = $this->uchr(8218);
        $this->chr["doubleQuoteOpen"] = $this->uchr(8220); // reset in set_smart_quotes_language()
        $this->chr["doubleQuoteClose"] = $this->uchr(8221); // reset in set_smart_quotes_language()
        $this->chr["doubleLow9Quote"] = $this->uchr(8222);
        $this->chr["ellipses"] = $this->uchr(8230);
        $this->chr["singlePrime"] = $this->uchr(8242);
        $this->chr["doublePrime"] = $this->uchr(8243);
        $this->chr["singleAngleQuoteOpen"] = $this->uchr(8249);
        $this->chr["singleAngleQuoteClose"] = $this->uchr(8250);
        $this->chr["fractionSlash"] = $this->uchr(8260);
        $this->chr["soundCopyMark"] = $this->uchr(8471);
        $this->chr["serviceMark"] = $this->uchr(8480);
        $this->chr["tradeMark"] = $this->uchr(8482);
        $this->chr["minus"] = $this->uchr(8722);
        $this->chr["leftCornerBracket"] = $this->uchr(12300);
        $this->chr["rightCornerBracket"] = $this->uchr(12301);
        $this->chr["leftWhiteCornerBracket"] = $this->uchr(12302);
        $this->chr["rightWhiteCornerBracket"] = $this->uchr(12303);

        if($setDefaults) {
            $this->set_defaults();
        }
        return TRUE;
    }

    // Resets every setting to its default by calling each setter without arguments.
    // Always returns TRUE.
    function set_defaults() {
        // general attributes
        $this->set_tags_to_ignore();
        $this->set_classes_to_ignore();
        $this->set_ids_to_ignore();

        //smart characters
        $this->set_smart_quotes();
        // DEPRECATED: $this->set_smart_quotes_language();
        // (the calls below write the same quote characters that the "en" language default would)
        $this->set_smart_quotes_primary(); /* added in version 1.15 */
        $this->set_smart_quotes_secondary(); /* added in version 1.15 */
        $this->set_smart_dashes();
        $this->set_smart_ellipses();
        $this->set_smart_diacritics();
        $this->set_diacritic_language();
        $this->set_diacritic_custom_replacements();
        $this->set_smart_marks();
        $this->set_smart_ordinal_suffix();
        $this->set_smart_math();
        $this->set_smart_fractions();
        $this->set_smart_exponents();
        // DEPRECATED: $this->set_smart_multiplication();
        // (it writes the same "smartMath" setting that set_smart_math() already set above)

        //smart spacing
        $this->set_single_character_word_spacing();
        $this->set_fraction_spacing();
        $this->set_unit_spacing();
        $this->set_units();
        $this->set_dash_spacing();
        $this->set_dewidow();
        $this->set_max_dewidow_length();
        $this->set_max_dewidow_pull();
        $this->set_wrap_hard_hyphens();
        $this->set_url_wrap();
        $this->set_email_wrap();
        $this->set_min_after_url_wrap();
        $this->set_space_collapse();

        //character styling
        $this->set_style_ampersands();
        $this->set_style_caps();
        $this->set_style_initial_quotes();
        $this->set_style_numbers();
        $this->set_initial_quote_tags();

        //hyphenation
        $this->set_hyphenation();
        $this->set_hyphenation_language();
        $this->set_min_length_hyphenation();
        $this->set_min_before_hyphenation();
        $this->set_min_after_hyphenation();
        $this->set_hyphenate_headings();
        $this->set_hyphenate_all_caps();
        $this->set_hyphenate_title_case(); // added in version 1.5
        $this->set_hyphenation_exceptions();

        return TRUE;
    }

    // sets tags where typography of children will be untouched
    //
    // $tags: array of tag names, or a string of names separated by whitespace and/or commas.
    // Names are lowercased, self-closing tags are dropped (they have no children to protect),
    // and a fixed list of tags that must never be processed is always appended.
    // The result is stored as a re-indexed list in $this->settings["ignoreTags"]. Always returns TRUE.
    function set_tags_to_ignore($tags = array("code", "head", "kbd", "object", "option", "pre", "samp", "script", "select", "style", "textarea", "title", "var", "math")) {
        if(!is_array($tags)) $tags = preg_split("/[\s,]+/", $tags, -1, PREG_SPLIT_NO_EMPTY);

        // array_map instead of foreach-by-reference: no dangling reference is left behind
        $tags = array_map('strtolower', $tags);

        // self closing tags shouldn't be in $tags
        $selfClosingTags = array('area', 'base', 'basefont', 'br', 'frame', 'hr', 'img', 'input', 'link', 'meta');
        // array_diff keeps the original order; array_values renumbers the keys 0..n-1
        $tags = array_values(array_diff($tags, $selfClosingTags));

        // include all inappropriate tags in $tags
        $inappropriateTags = array('iframe', 'textarea', 'button', 'select', 'optgroup', 'option' ,'map', 'style', 'head', 'title', 'script', 'applet', 'object', 'param');
        foreach($inappropriateTags as $inappropriateTag) {
            // every entry of $tags is a string at this point, so a strict comparison is safe
            if(!in_array($inappropriateTag, $tags, true)) {
                $tags[] = $inappropriateTag;
            }
        }

        $this->settings["ignoreTags"] = $tags;
        return TRUE;
    }

    // sets classes where typography of children will be untouched
    // $classes: array of class names, or a string of names separated by whitespace and/or commas
    function set_classes_to_ignore($classes = array("vcard", "noTypo")) {
        if(!is_array($classes)) $classes = preg_split("/[\s,]+/", $classes, -1, PREG_SPLIT_NO_EMPTY);
        $this->settings["ignoreClasses"] = $classes;
        return TRUE;
    }

    // sets IDs where typography of children will be untouched
    // $ids: array of element IDs, or a string of IDs separated by whitespace and/or commas
    function set_ids_to_ignore($ids = array()) {
        if(!is_array($ids)) $ids = preg_split("/[\s,]+/", $ids, -1, PREG_SPLIT_NO_EMPTY);
        $this->settings["ignoreIDs"] = $ids;
        return TRUE;
    }

    // curl quotemarks
    function set_smart_quotes($on = TRUE) {
        $this->settings["smartQuotes"] = $on;
        return TRUE;
    }

    // DEPRECATED -- see set_smart_quotes_primary() and set_smart_quotes_secondary()
    // language preferences for curling quotemarks
    // allowed values for $lang
    //      "en" = English style quotes, replaces "foo" with “foo”
    //      "de" = German style quotes, replaces "foo" with „foo“
    //      "fr" = French guillemets, replaces "foo" with «foo»
    //      "fr-reverse" = Reverse French guillemets, replaces "foo" with »foo«
    // any other value behaves like "en"
    function set_smart_quotes_language($lang = "en") {
        // switch compares loosely (==), exactly like the if/elseif chain it replaces
        switch($lang) {
            case "de":
                $this->chr["doubleQuoteOpen"] = $this->chr["doubleLow9Quote"];
                $this->chr["doubleQuoteClose"] = $this->uchr(8220);
                $this->chr["singleQuoteOpen"] = $this->chr["singleLow9Quote"];
                $this->chr["singleQuoteClose"] = $this->uchr(8216);
                break;
            case "fr":
                $this->chr["doubleQuoteOpen"] = $this->chr["guillemetOpen"];
                $this->chr["doubleQuoteClose"] = $this->chr["guillemetClose"];
                $this->chr["singleQuoteOpen"] = $this->chr["singleAngleQuoteOpen"];
                $this->chr["singleQuoteClose"] = $this->chr["singleAngleQuoteClose"];
                break;
            case "fr-reverse":
                $this->chr["doubleQuoteOpen"] = $this->chr["guillemetClose"];
                $this->chr["doubleQuoteClose"] = $this->chr["guillemetOpen"];
                $this->chr["singleQuoteOpen"] = $this->chr["singleAngleQuoteClose"];
                $this->chr["singleQuoteClose"] = $this->chr["singleAngleQuoteOpen"];
                break;
            default:
                $this->chr["doubleQuoteOpen"] = $this->uchr(8220);
                $this->chr["doubleQuoteClose"] = $this->uchr(8221);
                $this->chr["singleQuoteOpen"] = $this->uchr(8216);
                $this->chr["singleQuoteClose"] = $this->uchr(8217);
                break;
        }
        return TRUE;
    }

    // Primary quotemarks style
    // Sets $this->chr["doubleQuoteOpen"] and $this->chr["doubleQuoteClose"].
    // allowed values for $style (any other value behaves like "doubleCurled")
    //      "doubleCurled" => "“foo”",
    //      "doubleCurledReversed" => "”foo”",
    //      "doubleLow9" => "„foo”",
    //      "doubleLow9Reversed" => "„foo“",
    //      "singleCurled" => "‘foo’",
    //      "singleCurledReversed" => "’foo’",
    //      "singleLow9" => "‚foo’",
    //      "singleLow9Reversed" => "‚foo‘",
    //      "doubleGuillemetsFrench" => "« foo »",
    //      "doubleGuillemets" => "«foo»",
    //      "doubleGuillemetsReversed" => "»foo«",
    //      "singleGuillemets" => "‹foo›",
    //      "singleGuillemetsReversed" => "›foo‹",
    //      "cornerBrackets" => "「foo」",
    //      "whiteCornerBracket" => "『foo』",
    function set_smart_quotes_primary($style = "doubleCurled") {
        // switch compares loosely (==) and in the same order as the if/elseif chain it replaces
        switch($style) {
            case "doubleCurled":
            default:
                $open = $this->uchr(8220);
                $close = $this->uchr(8221);
                break;
            case "doubleCurledReversed":
                $open = $this->uchr(8221);
                $close = $this->uchr(8221);
                break;
            case "doubleLow9":
                $open = $this->chr["doubleLow9Quote"];
                $close = $this->uchr(8221);
                break;
            case "doubleLow9Reversed":
                $open = $this->chr["doubleLow9Quote"];
                $close = $this->uchr(8220);
                break;
            case "singleCurled":
                $open = $this->uchr(8216);
                $close = $this->uchr(8217);
                break;
            case "singleCurledReversed":
                $open = $this->uchr(8217);
                $close = $this->uchr(8217);
                break;
            case "singleLow9":
                $open = $this->chr["singleLow9Quote"];
                $close = $this->uchr(8217);
                break;
            case "singleLow9Reversed":
                $open = $this->chr["singleLow9Quote"];
                $close = $this->uchr(8216);
                break;
            case "doubleGuillemetsFrench":
                // French typography separates guillemets from the text with a no-break space
                $open = $this->chr["guillemetOpen"].$this->chr["noBreakSpace"];
                $close = $this->chr["noBreakSpace"].$this->chr["guillemetClose"];
                break;
            case "doubleGuillemets":
                $open = $this->chr["guillemetOpen"];
                $close = $this->chr["guillemetClose"];
                break;
            case "doubleGuillemetsReversed":
                $open = $this->chr["guillemetClose"];
                $close = $this->chr["guillemetOpen"];
                break;
            case "singleGuillemets":
                $open = $this->chr["singleAngleQuoteOpen"];
                $close = $this->chr["singleAngleQuoteClose"];
                break;
            case "singleGuillemetsReversed":
                $open = $this->chr["singleAngleQuoteClose"];
                $close = $this->chr["singleAngleQuoteOpen"];
                break;
            case "cornerBrackets":
                $open = $this->chr["leftCornerBracket"];
                $close = $this->chr["rightCornerBracket"];
                break;
            case "whiteCornerBracket":
                $open = $this->chr["leftWhiteCornerBracket"];
                $close = $this->chr["rightWhiteCornerBracket"];
                break;
        }
        $this->chr["doubleQuoteOpen"] = $open;
        $this->chr["doubleQuoteClose"] = $close;
        return TRUE;
    }

    // Secondary quotemarks style
    // allowed values for $style
    //      "doubleCurled" => "“foo”",
    //      "doubleCurledReversed" => "”foo”",
    //      "doubleLow9" => "„foo”",
//      "doubleLow9Reversed" => "„foo“",
    //      "singleCurled" => "‘foo’",
    //      "singleCurledReversed" => "’foo’",
    //      "singleLow9" => "‚foo’",
    //      "singleLow9Reversed" => "‚foo‘",
    //      "doubleGuillemetsFrench" => "« foo »",
    //      "doubleGuillemets" => "«foo»",
    //      "doubleGuillemetsReversed" => "»foo«",
    //      "singleGuillemets" => "‹foo›",
    //      "singleGuillemetsReversed" => "›foo‹",
    //      "cornerBrackets" => "「foo」",
    //      "whiteCornerBracket" => "『foo』",
    //
    // Selects the pair of characters stored in $this->chr["singleQuoteOpen"] and
    // $this->chr["singleQuoteClose"]. Accepts the same style names as set_smart_quotes_primary();
    // an unrecognised name falls back to "singleCurled". Always returns TRUE.
    function set_smart_quotes_secondary($style = "singleCurled") {
        // switch uses loose comparison and tests the cases top to bottom,
        // which mirrors an if/elseif chain built on ==
        switch($style) {
            case "doubleCurled":
                $opening = $this->uchr(8220);
                $closing = $this->uchr(8221);
                break;
            case "doubleCurledReversed":
                $opening = $this->uchr(8221);
                $closing = $this->uchr(8221);
                break;
            case "doubleLow9":
                $opening = $this->chr["doubleLow9Quote"];
                $closing = $this->uchr(8221);
                break;
            case "doubleLow9Reversed":
                $opening = $this->chr["doubleLow9Quote"];
                $closing = $this->uchr(8220);
                break;
            case "singleCurled":
            default:
                $opening = $this->uchr(8216);
                $closing = $this->uchr(8217);
                break;
            case "singleCurledReversed":
                $opening = $this->uchr(8217);
                $closing = $this->uchr(8217);
                break;
            case "singleLow9":
                $opening = $this->chr["singleLow9Quote"];
                $closing = $this->uchr(8217);
                break;
            case "singleLow9Reversed":
                $opening = $this->chr["singleLow9Quote"];
                $closing = $this->uchr(8216);
                break;
            case "doubleGuillemetsFrench":
                // guillemets padded on the inside with a no-break space
                $opening = $this->chr["guillemetOpen"].$this->chr["noBreakSpace"];
                $closing = $this->chr["noBreakSpace"].$this->chr["guillemetClose"];
                break;
            case "doubleGuillemets":
                $opening = $this->chr["guillemetOpen"];
                $closing = $this->chr["guillemetClose"];
                break;
            case "doubleGuillemetsReversed":
                $opening = $this->chr["guillemetClose"];
                $closing = $this->chr["guillemetOpen"];
                break;
            case "singleGuillemets":
                $opening = $this->chr["singleAngleQuoteOpen"];
                $closing = $this->chr["singleAngleQuoteClose"];
                break;
            case "singleGuillemetsReversed":
                $opening = $this->chr["singleAngleQuoteClose"];
                $closing = $this->chr["singleAngleQuoteOpen"];
                break;
            case "cornerBrackets":
                $opening = $this->chr["leftCornerBracket"];
                $closing = $this->chr["rightCornerBracket"];
                break;
            case "whiteCornerBracket":
                $opening = $this->chr["leftWhiteCornerBracket"];
                $closing = $this->chr["rightWhiteCornerBracket"];
                break;
        }

        $this->chr["singleQuoteOpen"] = $opening;
        $this->chr["singleQuoteClose"] = $closing;
        return TRUE;
    }

    // Switches dash replacement on or off:
    // replaces "a--a" with En Dash " -- " and "---" with Em Dash
    function set_smart_dashes($on = TRUE) {
        $this->settings["smartDashes"] = $on;
        return TRUE;
    }

    // replaces "..."
// with "…"
    //
    // Enables or disables replacing "..." with the ellipsis character "…".
    function set_smart_ellipses($on = TRUE) {
        $this->settings["smartEllipses"] = $on;
        return TRUE;
    }

    // replaces "creme brulee" with "crème brûlée"
    function set_smart_diacritics($on = TRUE) {
        $this->settings["smartDiacritics"] = $on;
        return TRUE;
    }

    // defines the language used for diacritic replacement
    //
    // Loads diacritics/<lang>.php from this file's directory (falling back to en-US.php when that
    // file does not exist) and stores the $diacriticWords it defines in $this->settings["diacriticWords"].
    // Does nothing when $lang is already the active language.
    // NOTE(review): $lang becomes part of an include path -- only pass trusted values.
    function set_diacritic_language($lang = "en-US") {
        if (isset($this->settings["diacriticLanguage"]) && $this->settings["diacriticLanguage"] == $lang) return TRUE;

        $this->settings["diacriticLanguage"] = $lang;

        // include the very file that was tested with file_exists(), instead of a relative
        // path that PHP would resolve through the include_path first
        $diacriticsDir = dirname(__FILE__).'/diacritics/';
        $languageFile = $diacriticsDir.$this->settings["diacriticLanguage"].'.php';
        if(!file_exists($languageFile)) {
            $languageFile = $diacriticsDir.'en-US.php';
        }
        include($languageFile);

        $this->settings["diacriticWords"] = $diacriticWords;
        return TRUE;
    }

    // $customReplacements must be
    //      an array formatted array(needle=>replacement, needle=>replacement...), or
    //      a string formatted `"needle"=>"replacement","needle"=>"replacement",...`
    // A list of `"needle"=>"replacement"` strings (integer keys) is parsed like the string form.
    // Keys and values are trimmed and stripped of tags, then stored in
    // $this->settings["diacriticCustomReplacements"]. Always returns TRUE.
    function set_diacritic_custom_replacements($customReplacements = array()) {
        $replacements = array();
        if(!is_array($customReplacements)) $customReplacements = preg_split("/,/", $customReplacements, -1, PREG_SPLIT_NO_EMPTY);

        foreach($customReplacements as $needle => $customReplacement) {
            // start clean on every pass so a malformed entry cannot inherit the previous entry's key or value
            $key = NULL;
            $value = NULL;

            if(is_string($needle)) {
                // documented array form: needle => replacement
                if("" != $needle && is_string($customReplacement) && "" != $customReplacement) {
                    $key = $needle;
                    $value = $customReplacement;
                }
            } else {
                //account for single and double quotes
                preg_match("/(?:\")([^\"]+)(?:\"\s*=>)/", $customReplacement, $doubleQuoteKeyMatch);
                preg_match("/(?:')([^']+)(?:'\s*=>)/", $customReplacement, $singleQuoteKeyMatch);
                preg_match("/(?:=>\s*\")([^\"]+)(?:\")/", $customReplacement, $doubleQuoteValueMatch);
                preg_match("/(?:=>\s*')([^']+)(?:')/", $customReplacement, $singleQuoteValueMatch);

                if( isset($doubleQuoteKeyMatch[1]) && ( $doubleQuoteKeyMatch[1] != "" ) ) {
                    $key = $doubleQuoteKeyMatch[1];
                } elseif( isset($singleQuoteKeyMatch[1]) && ( $singleQuoteKeyMatch[1] != "" ) ) {
                    $key = $singleQuoteKeyMatch[1];
                }

                if( isset($doubleQuoteValueMatch[1]) && ( $doubleQuoteValueMatch[1] != "" ) ) {
                    $value = $doubleQuoteValueMatch[1];
                } elseif( isset($singleQuoteValueMatch[1]) && ( $singleQuoteValueMatch[1] != "" ) ) {
                    $value = $singleQuoteValueMatch[1];
                }
            }

            if( isset($key) && isset($value) ) {
                $replacements[strip_tags(trim($key))] = strip_tags(trim($value));
            }
        }

        $this->settings["diacriticCustomReplacements"] = $replacements;
        return TRUE;
    }

    // replaces (r) (c) (tm) (sm) (p) (R) (C) (TM) (SM) (P) with ® © ™ ℠ ℗
    function set_smart_marks($on = TRUE) {
        $this->settings["smartMarks"] = $on;
        return TRUE;
    }

    // enables smart math characters (stored as the "smartMath" setting)
    // NOTE(review): the original comment here was a copy of the fractions comment; presumably this
    // covers minus, multiplication and division signs -- confirm in smart_math()
    function set_smart_math($on = TRUE) {
        $this->settings["smartMath"] = $on;
        return TRUE;
    }

    // enables smart exponents (stored as the "smartExponents" setting)
    // NOTE(review): the original comment here was a copy of the fractions comment -- see smart_exponents() for the exact markup
    function set_smart_exponents($on = TRUE) {
        $this->settings["smartExponents"] = $on;
        return TRUE;
    }

    // enables smart formatting of fractions such as 1/4
    // NOTE(review): the markup example in the original comment was lost -- see smart_fractions() for the exact output
    function set_smart_fractions($on = TRUE) {
        $this->settings["smartFractions"] = $on;
        return TRUE;
    }

    // DEPRECATED -- writes the same "smartMath" setting as set_smart_math()
    function set_smart_multiplication($on = TRUE) {
        $this->settings["smartMath"] = $on;
        return TRUE;
    }

    // enables smart handling of ordinal suffixes
    // NOTE(review): the wrapping markup named in the original comment was lost -- see smart_ordinal_suffix()
    function set_smart_ordinal_suffix($on = TRUE) {
        $this->settings["smartOrdinalSuffix"] = $on;
        return TRUE;
    }

    // single character words are forced to next line with insertion of a no-break space
    function set_single_character_word_spacing($on = TRUE) {
        $this->settings["singleCharacterWordSpacing"] = $on;
        return TRUE;
    }

    // fractions are kept together with their whole number with insertion of a no-break space
    // NOTE(review): the original comment was a copy of the unit spacing comment -- confirm against the fraction spacing worker
    function set_fraction_spacing($on = TRUE) {
        $this->settings["fractionSpacing"] = $on;
        return TRUE;
    }

    // units and values are kept together with insertion of a no-break space
    function set_unit_spacing($on = TRUE) {
        $this->settings["unitSpacing"] = $on;
        return TRUE;
    }

    // a list of units to keep with their values
    // $units: array of units, or a string of units separated by whitespace and/or commas
    function set_units($units = array()) {
        if(!is_array($units)) $units = preg_split("/[\s,]+/", $units, -1, PREG_SPLIT_NO_EMPTY);
        $this->settings["units"] = $units;
        return TRUE;
    }

    // Em and En dashes are wrapped in thin spaces
    function set_dash_spacing($on = TRUE) {
        $this->settings["dashSpacing"] = $on;
        return TRUE;
    }

    // Remove extra space characters
    function set_space_collapse($on = TRUE) {
        $this->settings["spaceCollapse"] = $on;
        return TRUE;
    }

    // enables widow handling
    function set_dewidow($on = TRUE) {
        $this->settings["dewidow"] = $on;
        return TRUE;
    }

    // establishes maximum length of a widow that will be protected
    // values of 1 or less are replaced by the default of 5
    function set_max_dewidow_length($len = 5) {
        $len = ($len > 1) ? $len : 5;
        $this->settings["dewidowMaxLength"] = $len;
        return TRUE;
    }

    // establishes maximum length of pulled text to keep widows company
    // values of 1 or less are replaced by the default of 5
    function set_max_dewidow_pull($len = 5) {
        $len = ($len > 1) ? $len : 5;
        $this->settings["dewidowMaxPull"] = $len;
        return TRUE;
    }

    // enables wrapping at hard hyphens internal to a word with the insertion of a zero-width-space
    function set_wrap_hard_hyphens($on = TRUE) {
        $this->settings["hyphenHardWrap"] = $on;
        return TRUE;
    }

    // enables wrapping of urls
    function set_url_wrap($on = TRUE) {
        $this->settings["urlWrap"] = $on;
        return TRUE;
    }

    // enables wrapping of email addresses
    function set_email_wrap($on = TRUE) {
        $this->settings["emailWrap"] = $on;
        return TRUE;
    }

    // establishes minimum character requirement after a url wrapping point
    // values of 0 or less are replaced by the default of 5
    function set_min_after_url_wrap($len = 5) {
        $len = ($len > 0) ? $len : 5;
        $this->settings["urlMinAfterWrap"] = $len;
        return TRUE;
    }

    // enables styling of ampersands
    // (process() skips text that is already inside an element with class "amp")
    function set_style_ampersands($on = TRUE) {
        $this->settings["styleAmpersands"] = $on;
        return TRUE;
    }

    // enables styling of runs of capital letters
    // (process() skips text that is already inside an element with class "caps")
    function set_style_caps($on = TRUE) {
        $this->settings["styleCaps"] = $on;
        return TRUE;
    }

    // enables styling of initial quotes
    // (process() skips text that is already inside an element with class "quo" or "dquo")
    function set_style_initial_quotes($on = TRUE) {
        $this->settings["styleInitialQuotes"] = $on;
        return TRUE;
    }

    // enables styling of numbers
    // (process() skips text that is already inside an element with class "numbers")
    function set_style_numbers($on = TRUE) {
        $this->settings["styleNumbers"] = $on;
        return TRUE;
    }

    // sets tags where initial quotes and guillemets should be styled
    // $tags: array of tag names, or a string of names separated by anything other than letters and digits
    function set_initial_quote_tags($tags = array("p", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "li", "dd", "dt")) {
        // the /i flag matters: without it uppercase letters count as separators, so "P, H1" would yield "1"
        if(!is_array($tags)) $tags = preg_split("/[^a-z0-9]+/i", $tags, -1, PREG_SPLIT_NO_EMPTY);
        // array_map instead of foreach-by-reference: no dangling reference is left behind
        $tags = array_map('strtolower', $tags);
        $this->settings["initialQuoteTags"] = $tags;
        return TRUE;
    }

    // enables hyphenation of text
    function set_hyphenation($on = TRUE) {
        $this->settings["hyphenation"] = $on;
        return TRUE;
    }

    // defines hyphenation language for text
    //
    // Loads lang/<lang>.php from this file's directory (falling back to en-US.php when that file
    // does not exist) and stores the $patgen, $patgenMaxSeg and $patgenExceptions it defines.
    // Does nothing when $lang is already the active language.
    // NOTE(review): $lang becomes part of an include path -- only pass trusted values.
    function set_hyphenation_language($lang = "en-US") {
        if (isset($this->settings["hyphenLanguage"]) && $this->settings["hyphenLanguage"] == $lang) return TRUE;

        $this->settings["hyphenLanguage"] = $lang;

        // include the very file that was tested with file_exists(), instead of a relative
        // path that PHP would resolve through the include_path first
        $patternDir = dirname(__FILE__).'/lang/';
        $patternFile = $patternDir.$this->settings["hyphenLanguage"].'.php';
        if(!file_exists($patternFile)) {
            $patternFile = $patternDir.'en-US.php';
        }
        include($patternFile);

        $this->settings["hyphenationPattern"] = $patgen;
        $this->settings["hyphenationPatternMaxSegment"] = $patgenMaxSeg;
        $this->settings["hyphenationPatternExceptions"] = $patgenExceptions;

        // make sure hyphenationExceptions is not set to force remerging of patgen and custom exceptions
        if(isset($this->settings["hyphenationExceptions"])) unset($this->settings["hyphenationExceptions"]);

        return TRUE;
    }

    // establishes minimum length of a word that may be hyphenated
    // values of 1 or less are replaced by the default of 5
    function set_min_length_hyphenation($len = 5) {
        $len = ($len > 1) ? $len : 5;
        $this->settings["hyphenMinLength"] = $len;
        return TRUE;
    }

    // establishes minimum character requirement before a hyphenation point
    // values of 0 or less are replaced by the default of 3
    function set_min_before_hyphenation($len = 3) {
        $len = ($len > 0) ? $len : 3;
        $this->settings["hyphenMinBefore"] = $len;
        return TRUE;
    }

    // establishes minimum character requirement after a hyphenation point
    // values of 0 or less are replaced by the default of 2
    function set_min_after_hyphenation($len = 2) {
        $len = ($len > 0) ? $len : 2;
        $this->settings["hyphenMinAfter"] = $len;
        return TRUE;
    }

    // allows/disallows hyphenation of title/heading text
    function set_hyphenate_headings($on = TRUE) {
        $this->settings["hyphenateTitle"] = $on;
        return TRUE;
    }

    // allows hyphenation of strings of all capital characters
    function set_hyphenate_all_caps($on = TRUE) {
        $this->settings["hyphenateAllCaps"] = $on;
        return TRUE;
    }

    // allows hyphenation of title case words (stored as the "hyphenateTitleCase" setting)
    // added in version 1.5
    function set_hyphenate_title_case($on = TRUE) {
        $this->settings["hyphenateTitleCase"] = $on;
        return TRUE;
    }

    // defines custom word hyphenations
    // expected input is an array of words with all hyphenation points marked with a hard hyphen
    // (a string is split on every character that is not an ASCII letter, digit or hyphen)
    //
    // Stores array(lowercased word without hyphens => lowercased word with hyphens) in
    // $this->settings["hyphenationCustomExceptions"].
    // Returns TRUE on success. Returns FALSE, leaving the settings untouched, when a word is neither
    // ASCII nor UTF-8, or when it is not plain ASCII and the mbstring extension is unavailable.
    function set_hyphenation_exceptions($exceptions = array()) {
        $encodings = array("ASCII","UTF-8", "ISO-8859-1");
        // check for mbstring before the first mb_* call, so a missing extension yields FALSE, not a fatal error
        $hasMultibyteSupport = function_exists('mb_detect_encoding') && function_exists('mb_strtolower');

        if(!is_array($exceptions)) $exceptions = preg_split("/[^a-zA-Z0-9\-]+/", $exceptions, -1, PREG_SPLIT_NO_EMPTY);

        $e = array();
        foreach($exceptions as $exception) {
            if($hasMultibyteSupport) {
                $encoding = mb_detect_encoding($exception."a", $encodings);
            } else {
                // without mbstring only pure 7-bit words can be handled
                $encoding = preg_match('/[\x80-\xff]/', $exception) ? FALSE : "ASCII";
            }

            if("UTF-8" == $encoding) {
                $exception = mb_strtolower($exception, "UTF-8");
            } elseif("ASCII" == $encoding) {
                //same as above without multibyte string functions to improve performance
                $exception = strtolower($exception);
            } else {
                return FALSE;
            }

            // key: the word with its hyphenation marks removed; value: the marked-up word
            $e[str_replace("-", "", $exception)] = $exception;
        }

        $this->settings["hyphenationCustomExceptions"] = $e;

        // make sure hyphenationExceptions is not set to force remerging of patgen and custom exceptions
        if(isset($this->settings["hyphenationExceptions"])) unset($this->settings["hyphenationExceptions"]);

        return TRUE;
    }

    #=======================================================================
    #=======================================================================
    #== METHODS - ACTIONS, let's do something!
    #=======================================================================
    #=======================================================================

    # Returns: ARRAY of supported hyphenation languages in the form array( language code => language name)
    # The code is the file name (without ".php") of each file in lang/; the name is the value that
    # file assigns to $patgenLanguage. The array is sorted by name.
    function get_languages() {
        return $this->read_language_names(dirname(__FILE__)."/lang/", "patgenLanguage");
    }

    # Returns: ARRAY of supported diacritic languages in the form array( language code => language name)
    # The code is the file name (without ".php") of each file in diacritics/; the name is the value
    # that file assigns to $diacriticLanguage. The array is sorted by name.
    function get_diacritic_languages() {
        return $this->read_language_names(dirname(__FILE__)."/diacritics/", "diacriticLanguage");
    }

    # Helper for get_languages() and get_diacritic_languages().
    # Reads every *.php file in $langDir (which must end with a slash) as text and extracts the quoted
    # string assigned to the variable named $nameVariable (given without the leading "$").
    # Returns: ARRAY( file name without ".php" => extracted name ), sorted by name. The name is an empty
    # string when the file cannot be read or has no such assignment; the array is empty when the
    # directory cannot be opened.
    private function read_language_names($langDir, $nameVariable) {
        $results = array();

        $handler = opendir($langDir);
        if(FALSE === $handler) return $results;

        $pattern = '/\$'.preg_quote($nameVariable, '/').'\s*=\s*((".+")|(\'.+\'))\s*;/';

        // read all files in directory
        // (compare against FALSE explicitly: an entry named "0" must not end the loop)
        while (FALSE !== ($file = readdir($handler))) {
            // we only want the php files
            if (substr($file, -4) != ".php") continue;

            $languageName = "";
            $fileContent = file_get_contents($langDir.$file);
            if(FALSE !== $fileContent && preg_match($pattern, $fileContent, $matches)) {
                $languageName = substr($matches[1], 1, -1); // drop the surrounding quotes
            }
            $languageCode = substr($file, 0, -4);
            $results[$languageCode] = $languageName;
        }
        closedir($handler);

        asort($results);
        return $results;
    }

    # Action: modifies $html according to the defined settings
    # $isTitle: when truthy and 'h1' or 'h2' is among the ignored tags, $html is returned untouched;
    #           the flag is also handed on to hyphenate() and style_initial_quotes()
    # Returns: processed $html
    function process($html, $isTitle = FALSE) {
        if( isset($this->settings["ignoreTags"] ) && $isTitle && ( in_array('h1', $this->settings["ignoreTags"]) || in_array('h2', $this->settings["ignoreTags"]) ) )
            return $html;

        require_once("php-parser/php-parser.php");

        // parse the html
        $this->parsedHTML = new parseHTML();
        $this->parsedHTML->load($html);
        $this->parsedHTML->unlock_text();
        $tagsToIgnore = $this->parsedHTML->get_tags_by_name($this->settings["ignoreTags"]);
        if(isset($this->settings["ignoreClasses"])) $tagsToIgnore += $this->parsedHTML->get_tags_by_class($this->settings["ignoreClasses"]); //union to avoid dup keys
        if(isset($this->settings["ignoreIDs"])) $tagsToIgnore += $this->parsedHTML->get_tag_by_id($this->settings["ignoreIDs"]); //union to avoid dup keys
        $this->parsedHTML->lock_children($tagsToIgnore);
        $unlockedTexts = $this->parsedHTML->get_unlocked_text();

        foreach($unlockedTexts as &$unlockedText) {
            // we won't be doing anything with spaces, so we can jump ship if that is all we have
            if (0 == strlen(trim($unlockedText["value"]))) continue;

            // decode all characters except < > &
            $unlockedText["value"] = html_entity_decode($unlockedText["value"], ENT_QUOTES, "UTF-8"); //converts all HTML entities to their applicable characters
            $unlockedText["value"] = htmlspecialchars($unlockedText["value"], ENT_NOQUOTES, "UTF-8"); //returns < > & to their encoded HTML entities

            // modify anything that requires adjacent text awareness here
            $unlockedText = $this->smart_math($unlockedText);
            $unlockedText = $this->smart_diacritics($unlockedText);
            $unlockedText = $this->smart_quotes($unlockedText);
            $unlockedText = $this->smart_dashes($unlockedText);
            $unlockedText = $this->smart_ellipses($unlockedText);
            $unlockedText = $this->smart_marks($unlockedText);

            //keep spacing after smart character replacement
            $unlockedText = $this->single_character_word_spacing($unlockedText);
            $unlockedText = $this->dash_spacing($unlockedText);
            $unlockedText = $this->unit_spacing($unlockedText);

            //break it down for a bit more granularity
            $this->parsedText = new parseText();
            $this->parsedText->load($unlockedText);
            $parsedMixedWords = $this->parsedText->get_words(-1,0); // prohibit letter only words, allow caps
            $caps = (isset($this->settings["hyphenateAllCaps"]) && $this->settings["hyphenateAllCaps"]) ? 0 : -1 ;
            $parsedWords = $this->parsedText->get_words(1,$caps); // require letter only words, caps allowance in settings; mutually exclusive with $parsedMixedWords
            $parsedOther = $this->parsedText->get_other();

            // process individual text parts here
            $parsedMixedWords = $this->wrap_hard_hyphens($parsedMixedWords);
            $parsedWords = $this->hyphenate($parsedWords, $isTitle);
            $parsedOther = $this->wrap_urls($parsedOther);
            $parsedOther = $this->wrap_emails($parsedOther);

            //apply updates to unlockedText
            $this->parsedText->update($parsedMixedWords+$parsedWords+$parsedOther);
            $unlockedText = $this->parsedText->unload();

            //some final space manipulation
            $unlockedText = $this->dewidow($unlockedText);
            $unlockedText = $this->space_collapse($unlockedText);

            //everything that requires HTML injection occurs here (functions above assume tag-free content)
            //pay careful attention to functions below for tolerance of injected tags
            $unlockedText = $this->smart_ordinal_suffix($unlockedText); // call before "style_numbers" and "smart_fractions"
            $unlockedText = $this->smart_exponents($unlockedText); // call before "style_numbers"
            $unlockedText = $this->smart_fractions($unlockedText); // call before "style_numbers" and after "smart_ordinal_suffix"
            if(!$this->parsedHTML->in_class('caps', $unlockedText))
                $unlockedText = $this->style_caps($unlockedText); // call before "style_numbers"
            if(!$this->parsedHTML->in_class('numbers', $unlockedText))
                $unlockedText = $this->style_numbers($unlockedText); // call after "smart_ordinal_suffix", "smart_exponents", "smart_fractions", and "style_caps"
            if(!$this->parsedHTML->in_class('amp', $unlockedText))
                $unlockedText = $this->style_ampersands($unlockedText);
            if(!$this->parsedHTML->in_class(array('quo','dquo'), $unlockedText))
                $unlockedText = $this->style_initial_quotes($unlockedText, $isTitle);
        }
        unset($unlockedText); // break the reference left behind by the by-reference foreach

        $this->parsedHTML->update($unlockedTexts);
        return $this->parsedHTML->unload();
    }

    # Action: modifies $html according to the defined settings as only appropriate for RSS feeds
    #         (i.e. excluding processes that may not display well with limited character set intelligence)
    # $isTitle: when truthy and 'h1' or 'h2' is among the ignored tags, $html is returned untouched
    # Returns: processed $html
    function process_feed($html, $isTitle = FALSE) {
        if( isset($this->settings["ignoreTags"]) && $isTitle && ( in_array('h1', $this->settings["ignoreTags"]) || in_array('h2', $this->settings["ignoreTags"]) ) )
            return $html;

        require_once("php-parser/php-parser.php");

        // parse the html
        $this->parsedHTML = new parseHTML();
        $this->parsedHTML->load($html);
        $this->parsedHTML->unlock_text();
        $tagsToIgnore = $this->parsedHTML->get_tags_by_name($this->settings["ignoreTags"]);
        if(isset($this->settings["ignoreClasses"])) $tagsToIgnore += $this->parsedHTML->get_tags_by_class($this->settings["ignoreClasses"]); //union to avoid dup keys
        if(isset($this->settings["ignoreIDs"])) $tagsToIgnore += $this->parsedHTML->get_tag_by_id($this->settings["ignoreIDs"]); //union to avoid dup keys
        $this->parsedHTML->lock_children($tagsToIgnore);
        $unlockedTexts = $this->parsedHTML->get_unlocked_text();

        foreach($unlockedTexts as &$unlockedText) {
            // we won't be doing anything with spaces, so we can jump ship if that is all we have
            if (0 == strlen(trim($unlockedText["value"]))) continue;

            // decode all characters except < > &
            $unlockedText["value"] = html_entity_decode($unlockedText["value"], ENT_QUOTES, "UTF-8"); //converts all HTML entities to their applicable characters
            $unlockedText["value"] = htmlspecialchars($unlockedText["value"], ENT_NOQUOTES, "UTF-8"); //returns < > & to their encoded HTML entities

            // modify anything that requires adjacent text awareness here
            $unlockedText = $this->smart_quotes($unlockedText);
            $unlockedText = $this->smart_dashes($unlockedText);
            $unlockedText = $this->smart_ellipses($unlockedText);
            $unlockedText = $this->smart_marks($unlockedText);
        }
        unset($unlockedText); // break the reference left behind by the by-reference foreach

        // write the modified text tokens back into the parsed document
        $this->parsedHTML->update($unlockedTexts);
        return $this->parsedHTML->unload();
    }

    #=======================================================================
    #=======================================================================
    #== OTHER METHODS
    #=======================================================================
    #=======================================================================

    //expecting parsedHTML token of type text
    function smart_quotes($parsedHTMLtoken) {
        if(!isset($this->settings["smartQuotes"]) || !$this->settings["smartQuotes"]) return $parsedHTMLtoken;

        // NOTE(review): the whitespace inside this pattern fragment is kept exactly as found; whether it
        // is significant depends on the regex flags used where the fragment is consumed (not visible here)
        $nonEnglishWordCharacters = " [0-9A-Za-z]|\x{00c0}|\x{00c1}|\x{00c2}|\x{00c3}|\x{00c4}|\x{00c5}|\x{00c6}|\x{00c7}|\x{00c8}|\x{00c9}| \x{00ca}|\x{00cb}|\x{00cc}|\x{00cd}|\x{00ce}|\x{00cf}|\x{00d0}|\x{00d1}|\x{00d2}|\x{00d3}|\x{00d4}| \x{00d5}|\x{00d6}|\x{00d8}|\x{00d9}|\x{00da}|\x{00db}|\x{00dc}|\x{00dd}|\x{00de}|\x{00df}|\x{00e0}| \x{00e1}|\x{00e2}|\x{00e3}|\x{00e4}|\x{00e5}|\x{00e6}|\x{00e7}|\x{00e8}|\x{00e9}|\x{00ea}|\x{00eb}| \x{00ec}|\x{00ed}|\x{00ee}|\x{00ef}|\x{00f0}|\x{00f1}|\x{00f2}|\x{00f3}|\x{00f4}|\x{00f5}|\x{00f6}| \x{00f8}|\x{00f9}|\x{00fa}|\x{00fb}|\x{00fc}|\x{00fd}|\x{00fe}|\x{00ff}|\x{0100}|\x{0101}|\x{0102}| \x{0103}|\x{0104}|\x{0105}|\x{0106}|\x{0107}|\x{0108}|\x{0109}|\x{010a}|\x{010b}|\x{010c}|\x{010d}| \x{010e}|\x{010f}|\x{0110}|\x{0111}|\x{0112}|\x{0113}|\x{0114}|\x{0115}|\x{0116}|\x{0117}|\x{0118}| \x{0119}|\x{011a}|\x{011b}|\x{011c}|\x{011d}|\x{011e}|\x{011f}|\x{0120}|\x{0121}|\x{0122}|\x{0123}| \x{0124}|\x{0125}|\x{0126}|\x{0127}|\x{0128}|\x{0129}|\x{012a}|\x{012b}|\x{012c}|\x{012d}|\x{012e}| 
\x{012f}|\x{0130}|\x{0131}|\x{0132}|\x{0133}|\x{0134}|\x{0135}|\x{0136}|\x{0137}|\x{0138}|\x{0139}| \x{013a}|\x{013b}|\x{013c}|\x{013d}|\x{013e}|\x{013f}|\x{0140}|\x{0141}|\x{0142}|\x{0143}|\x{0144}| \x{0145}|\x{0146}|\x{0147}|\x{0148}|\x{0149}|\x{014a}|\x{014b}|\x{014c}|\x{014d}|\x{014e}|\x{014f}| \x{0150}|\x{0151}|\x{0152}|\x{0153}|\x{0154}|\x{0155}|\x{0156}|\x{0157}|\x{0158}|\x{0159}|\x{015a}| \x{015b}|\x{015c}|\x{015d}|\x{015e}|\x{015f}|\x{0160}|\x{0161}|\x{0162}|\x{0163}|\x{0164}|\x{0165}| \x{0166}|\x{0167}|\x{0168}|\x{0169}|\x{016a}|\x{016b}|\x{016c}|\x{016d}|\x{016e}|\x{016f}|\x{0170}| \x{0171}|\x{0172}|\x{0173}|\x{0174}|\x{0175}|\x{0176}|\x{0177}|\x{0178}|\x{0179}|\x{017a}|\x{017b}| \x{017c}|\x{017d}|\x{017e}|\x{017f} ";

        //need to get context of adjacent characters outside adjacent inline tags or HTML comment
        //if we have adjacent characters add them to the text
        $nextChr = "";
        $prevChr = "";
        if(isset($parsedHTMLtoken["prevChr"]) && $parsedHTMLtoken["prevChr"] != "") {
            $prevChr = $parsedHTMLtoken["prevChr"];
            $parsedHTMLtoken["value"] = $prevChr.$parsedHTMLtoken["value"];
        }
        if(isset($parsedHTMLtoken["nextChr"]) && $parsedHTMLtoken["nextChr"] != "") {
            $nextChr = $parsedHTMLtoken["nextChr"];
            $parsedHTMLtoken["value"] = $parsedHTMLtoken["value"].$nextChr;
        }

        ////Logic

        // before primes, handle quoted numbers
        $parsedHTMLtoken["value"] = preg_replace("/(?<=\W|\A)'(\d+)'(?=\W|\Z)/u", $this->chr["singleQuoteOpen"].'$1'.$this->chr["singleQuoteClose"], $parsedHTMLtoken["value"]);
        $parsedHTMLtoken["value"] = preg_replace("/(?<=\W|\A)\"(\d+)\"(?=\W|\Z)/u", $this->chr["doubleQuoteOpen"].'$1'.$this->chr["doubleQuoteClose"], $parsedHTMLtoken["value"]);

        // guillemets
        // NOTE(review): the next two statements are identical as found; presumably one of them (and the
        // ">>" one below) originally targeted the HTML-encoded form of the brackets -- confirm against upstream
        $parsedHTMLtoken["value"] = str_replace("<<", $this->chr["guillemetOpen"], $parsedHTMLtoken["value"]);
        $parsedHTMLtoken["value"] = str_replace("<<", $this->chr["guillemetOpen"], $parsedHTMLtoken["value"]);
        $parsedHTMLtoken["value"] = str_replace(">>", $this->chr["guillemetClose"], $parsedHTMLtoken["value"]);
$parsedHTMLtoken["value"] = str_replace(">>", $this->chr["guillemetClose"], $parsedHTMLtoken["value"]); // primes $parsedHTMLtoken["value"] = preg_replace("/(\b\d+)''(?=\W|\Z)/u", '$1'.$this->chr["doublePrime"], $parsedHTMLtoken["value"]); $parsedHTMLtoken["value"] = preg_replace("/(\b\d+)\"(?=\W|\Z)/u", '$1'.$this->chr["doublePrime"], $parsedHTMLtoken["value"]); $parsedHTMLtoken["value"] = preg_replace("/(\b\d+)'(?=\W|\Z)/u", '$1'.$this->chr["singlePrime"], $parsedHTMLtoken["value"]); // backticks $parsedHTMLtoken["value"] = str_replace("``", $this->chr["doubleQuoteOpen"], $parsedHTMLtoken["value"]); $parsedHTMLtoken["value"] = str_replace("`", $this->chr["singleQuoteOpen"], $parsedHTMLtoken["value"]); $parsedHTMLtoken["value"] = str_replace("''", $this->chr["doubleQuoteClose"], $parsedHTMLtoken["value"]); // comma quotes $parsedHTMLtoken["value"] =str_replace(",,", $this->chr["doubleLow9Quote"], $parsedHTMLtoken["value"]); $parsedHTMLtoken["value"] = preg_replace("/(?<=\s|\A),(?=\S)/", $this->chr["singleLow9Quote"], $parsedHTMLtoken["value"]); //like _,¿hola?'_ // apostrophes $parsedHTMLtoken["value"] = preg_replace("/(?<=[\w|$nonEnglishWordCharacters])'(?=[\w|$nonEnglishWordCharacters])/u", $this->chr["apostrophe"], $parsedHTMLtoken["value"]); $parsedHTMLtoken["value"] = preg_replace("/'(\d\d\b)/", $this->chr["apostrophe"].'$1', $parsedHTMLtoken["value"]); // decades: '98 $exceptions = array("'tain".$this->chr["apostrophe"]."t", "'twere", "'twas", "'tis", "'til", "'bout", "'nuff", "'round", "'cause", "'splainin"); $replacements = array($this->chr["apostrophe"]."tain".$this->chr["apostrophe"]."t", $this->chr["apostrophe"]."twere", $this->chr["apostrophe"]."twas", $this->chr["apostrophe"]."tis", $this->chr["apostrophe"]."til", $this->chr["apostrophe"]."bout", $this->chr["apostrophe"]."nuff", $this->chr["apostrophe"]."round", $this->chr["apostrophe"]."cause", $this->chr["apostrophe"]."splainin"); $parsedHTMLtoken["value"] = str_replace($exceptions, $replacements, 
$parsedHTMLtoken["value"]); //quotes $quoteRules = array("['", "{'", "('", "']", "'}", "')", "[\"", "{\"", "(\"", "\"]", "\"}", "\")", "\"'", "'\""); $quoteRulesReplace = array("[".$this->chr["singleQuoteOpen"], "{".$this->chr["singleQuoteOpen"], "(".$this->chr["singleQuoteOpen"], $this->chr["singleQuoteClose"]."]", $this->chr["singleQuoteClose"]."}", $this->chr["singleQuoteClose"].")", "[".$this->chr["doubleQuoteOpen"], "{".$this->chr["doubleQuoteOpen"], "(".$this->chr["doubleQuoteOpen"], $this->chr["doubleQuoteClose"]."]", $this->chr["doubleQuoteClose"]."}", $this->chr["doubleQuoteClose"].")", $this->chr["doubleQuoteOpen"].$this->chr["singleQuoteOpen"], $this->chr["singleQuoteClose"].$this->chr["doubleQuoteClose"]); $parsedHTMLtoken["value"] =str_replace($quoteRules, $quoteRulesReplace, $parsedHTMLtoken["value"]); $parsedHTMLtoken["value"] = preg_replace("/'(?=[\w|$nonEnglishWordCharacters])/u", $this->chr["singleQuoteOpen"], $parsedHTMLtoken["value"]); $parsedHTMLtoken["value"] = preg_replace("/(?<=[\w|$nonEnglishWordCharacters])'/u", $this->chr["singleQuoteClose"], $parsedHTMLtoken["value"]); $parsedHTMLtoken["value"] = preg_replace("/(?<=\s|\A)'(?=\S)/", $this->chr["singleQuoteOpen"], $parsedHTMLtoken["value"]); //like _'¿hola?'_ $parsedHTMLtoken["value"] = preg_replace("/(?<=\S)'(?=\s|\Z)/", $this->chr["singleQuoteClose"], $parsedHTMLtoken["value"]); $parsedHTMLtoken["value"] = preg_replace("/\"(?=[\w|$nonEnglishWordCharacters])/u", $this->chr["doubleQuoteOpen"], $parsedHTMLtoken["value"]); $parsedHTMLtoken["value"] = preg_replace("/(?<=[\w|$nonEnglishWordCharacters])\"/u", $this->chr["doubleQuoteClose"], $parsedHTMLtoken["value"]); $parsedHTMLtoken["value"] = preg_replace("/(?<=\s|\A)\"(?=\S)/", $this->chr["doubleQuoteOpen"], $parsedHTMLtoken["value"]); $parsedHTMLtoken["value"] = preg_replace("/(?<=\S)\"(?=\s|\Z)/", $this->chr["doubleQuoteClose"], $parsedHTMLtoken["value"]); //quote catch-alls - assume left over quotes are closing - as this is often the 
most complicated position, thus most likely to be missed $parsedHTMLtoken["value"] = str_replace("'", $this->chr["singleQuoteClose"], $parsedHTMLtoken["value"]); $parsedHTMLtoken["value"] = str_replace('"', $this->chr["doubleQuoteClose"], $parsedHTMLtoken["value"]); //if we have adjacent characters remove them from the text $encodings = array("ASCII","UTF-8"); $e = mb_detect_encoding($parsedHTMLtoken["value"]."a", $encodings);// ."a" is a hack; see http://www.php.net/manual/en/function.mb-detect-encoding.php#81936 if(!isset($e) || $e == "") $e = "ASCII"; if($prevChr != "") { $parsedHTMLtoken["value"] = mb_substr($parsedHTMLtoken["value"], 1, mb_strlen($parsedHTMLtoken["value"], $e), $e); } if($nextChr != "") { $parsedHTMLtoken["value"] = mb_substr($parsedHTMLtoken["value"], 0, mb_strlen($parsedHTMLtoken["value"], $e)-1, $e); } return $parsedHTMLtoken; } //expecting parsedHTML token of type text function smart_dashes($parsedHTMLtoken) { if(!isset($this->settings["smartDashes"]) || !$this->settings["smartDashes"]) return $parsedHTMLtoken; $nonEnglishWordCharacters = " [0-9A-Za-z]|\x{00c0}|\x{00c1}|\x{00c2}|\x{00c3}|\x{00c4}|\x{00c5}|\x{00c6}|\x{00c7}|\x{00c8}|\x{00c9}| \x{00ca}|\x{00cb}|\x{00cc}|\x{00cd}|\x{00ce}|\x{00cf}|\x{00d0}|\x{00d1}|\x{00d2}|\x{00d3}|\x{00d4}| \x{00d5}|\x{00d6}|\x{00d8}|\x{00d9}|\x{00da}|\x{00db}|\x{00dc}|\x{00dd}|\x{00de}|\x{00df}|\x{00e0}| \x{00e1}|\x{00e2}|\x{00e3}|\x{00e4}|\x{00e5}|\x{00e6}|\x{00e7}|\x{00e8}|\x{00e9}|\x{00ea}|\x{00eb}| \x{00ec}|\x{00ed}|\x{00ee}|\x{00ef}|\x{00f0}|\x{00f1}|\x{00f2}|\x{00f3}|\x{00f4}|\x{00f5}|\x{00f6}| \x{00f8}|\x{00f9}|\x{00fa}|\x{00fb}|\x{00fc}|\x{00fd}|\x{00fe}|\x{00ff}|\x{0100}|\x{0101}|\x{0102}| \x{0103}|\x{0104}|\x{0105}|\x{0106}|\x{0107}|\x{0108}|\x{0109}|\x{010a}|\x{010b}|\x{010c}|\x{010d}| \x{010e}|\x{010f}|\x{0110}|\x{0111}|\x{0112}|\x{0113}|\x{0114}|\x{0115}|\x{0116}|\x{0117}|\x{0118}| \x{0119}|\x{011a}|\x{011b}|\x{011c}|\x{011d}|\x{011e}|\x{011f}|\x{0120}|\x{0121}|\x{0122}|\x{0123}| 
\x{0124}|\x{0125}|\x{0126}|\x{0127}|\x{0128}|\x{0129}|\x{012a}|\x{012b}|\x{012c}|\x{012d}|\x{012e}| \x{012f}|\x{0130}|\x{0131}|\x{0132}|\x{0133}|\x{0134}|\x{0135}|\x{0136}|\x{0137}|\x{0138}|\x{0139}| \x{013a}|\x{013b}|\x{013c}|\x{013d}|\x{013e}|\x{013f}|\x{0140}|\x{0141}|\x{0142}|\x{0143}|\x{0144}| \x{0145}|\x{0146}|\x{0147}|\x{0148}|\x{0149}|\x{014a}|\x{014b}|\x{014c}|\x{014d}|\x{014e}|\x{014f}| \x{0150}|\x{0151}|\x{0152}|\x{0153}|\x{0154}|\x{0155}|\x{0156}|\x{0157}|\x{0158}|\x{0159}|\x{015a}| \x{015b}|\x{015c}|\x{015d}|\x{015e}|\x{015f}|\x{0160}|\x{0161}|\x{0162}|\x{0163}|\x{0164}|\x{0165}| \x{0166}|\x{0167}|\x{0168}|\x{0169}|\x{016a}|\x{016b}|\x{016c}|\x{016d}|\x{016e}|\x{016f}|\x{0170}| \x{0171}|\x{0172}|\x{0173}|\x{0174}|\x{0175}|\x{0176}|\x{0177}|\x{0178}|\x{0179}|\x{017a}|\x{017b}| \x{017c}|\x{017d}|\x{017e}|\x{017f} "; $parsedHTMLtoken["value"] = str_replace("---", $this->chr["emDash"], $parsedHTMLtoken["value"]); $parsedHTMLtoken["value"] = str_replace(" -- ", " ".$this->chr["emDash"]." ", $parsedHTMLtoken["value"]); $parsedHTMLtoken["value"] = str_replace("--", $this->chr["enDash"], $parsedHTMLtoken["value"]); $parsedHTMLtoken["value"] = str_replace(" - ", " ".$this->chr["emDash"]." 
", $parsedHTMLtoken["value"]); $parsedHTMLtoken["value"] = preg_replace("/(\A|\s)\-([\w|$nonEnglishWordCharacters])/u", '$1'.$this->chr["enDash"].'$2', $parsedHTMLtoken["value"]); $parsedHTMLtoken["value"] = preg_replace("/([\w|$nonEnglishWordCharacters])\-(\Z|\s)/u", '$1'.$this->chr["enDash"].'$2', $parsedHTMLtoken["value"]); $parsedHTMLtoken["value"] = preg_replace("/(\b\d+)\-(\d+\b)/", '$1'.$this->chr["enDash"].'$2', $parsedHTMLtoken["value"]); $parsedHTMLtoken["value"] = preg_replace("/(\b\d{3})".$this->chr["enDash"]."(\d{4}\b)/", '$1'.$this->chr["noBreakHyphen"].'$2', $parsedHTMLtoken["value"]); // phone numbers $parsedHTMLtoken["value"] = str_replace("xn".$this->chr["enDash"], "xn--", $parsedHTMLtoken["value"]); // revert dates back to original formats // YYYY-MM-DD $pattern = "/ ( (?<=\s|\A|".$this->chr["noBreakSpace"].") [12][0-9]{3} ) [\-".$this->chr["enDash"]."] ( (?:[0][1-9]|[1][0-2]) ) [\-".$this->chr["enDash"]."] ( (?:[0][1-9]|[12][0-9]|[3][0-1]) (?=\s|\Z|\)|\]|\.|\,|\?|\;|\:|\'|\"|\!|".$this->chr["noBreakSpace"].") ) /xu"; $parsedHTMLtoken["value"] = preg_replace($pattern, "$1-$2-$3", $parsedHTMLtoken["value"]); // MM-DD-YYYY or DD-MM-YYYY $pattern = "/ (?: (?: ( (?<=\s|\A|".$this->chr["noBreakSpace"].") (?:[0]?[1-9]|[1][0-2]) ) [\-".$this->chr["enDash"]."] ( (?:[0]?[1-9]|[12][0-9]|[3][0-1]) ) ) | (?: ( (?<=\s|\A|".$this->chr["noBreakSpace"].") (?:[0]?[1-9]|[12][0-9]|[3][0-1]) ) [\-".$this->chr["enDash"]."] ( (?:[0]?[1-9]|[1][0-2]) ) ) ) [\-".$this->chr["enDash"]."] ( [12][0-9]{3} (?=\s|\Z|\)|\]|\.|\,|\?|\;|\:|\'|\"|\!|".$this->chr["noBreakSpace"].") ) /xu"; $parsedHTMLtoken["value"] = preg_replace($pattern, "$1$3-$2$4-$5", $parsedHTMLtoken["value"]); // YYYY-MM or YYYY-DDDD next $pattern = "/ ( (?<=\s|\A|".$this->chr["noBreakSpace"].") [12][0-9]{3} ) [\-".$this->chr["enDash"]."] ( (?: (?:[0][1-9]|[1][0-2]) | (?:[0][0-9][1-9]|[1-2][0-9]{2}|[3][0-5][0-9]|[3][6][0-6]) ) (?=\s|\Z|\)|\]|\.|\,|\?|\;|\:|\'|\"|\!|".$this->chr["noBreakSpace"].") ) /xu"; 
$parsedHTMLtoken["value"] = preg_replace($pattern, "$1-$2", $parsedHTMLtoken["value"]); return $parsedHTMLtoken; } //expecting parsedHTML token of type text function smart_ellipses($parsedHTMLtoken) { if(!isset($this->settings["smartEllipses"]) || !$this->settings["smartEllipses"]) return $parsedHTMLtoken; $parsedHTMLtoken["value"] = str_replace(array("....", ". . . .",), ".".$this->chr["ellipses"], $parsedHTMLtoken["value"]); $parsedHTMLtoken["value"] = str_replace(array("...", ". . .",), $this->chr["ellipses"], $parsedHTMLtoken["value"]); return $parsedHTMLtoken; } //expecting parsedHTML token of type text function smart_diacritics($parsedHTMLtoken) { if(!isset($this->settings["smartDiacritics"]) || !$this->settings["smartDiacritics"]) return $parsedHTMLtoken; if( isset($this->settings["diacriticCustomReplacements"]) && ( count($this->settings["diacriticCustomReplacements"]) > 0 ) ) { foreach($this->settings["diacriticCustomReplacements"] as $needle => $replacement) { $parsedHTMLtoken["value"] = preg_replace("/\b$needle\b/", $replacement, $parsedHTMLtoken["value"]); } } if( isset($this->settings["diacriticWords"]) && ( count($this->settings["diacriticWords"]) > 0 ) ) { foreach($this->settings["diacriticWords"] as $needle => $replacement) { $parsedHTMLtoken["value"] = preg_replace("/\b$needle\b/", $replacement, $parsedHTMLtoken["value"]); } } return $parsedHTMLtoken; } //expecting parsedHTML token of type text function smart_marks($parsedHTMLtoken) { if(!isset($this->settings["smartMarks"]) || !$this->settings["smartMarks"]) return $parsedHTMLtoken; $parsedHTMLtoken["value"] = str_replace(array("(c)", "(C)"), $this->chr["copyright"], $parsedHTMLtoken["value"]); $parsedHTMLtoken["value"] = str_replace(array("(r)", "(R)"), $this->chr["registeredMark"], $parsedHTMLtoken["value"]); $parsedHTMLtoken["value"] = str_replace(array("(p)", "(P)"), $this->chr["soundCopyMark"], $parsedHTMLtoken["value"]); $parsedHTMLtoken["value"] = str_replace(array("(sm)", "(SM)"), 
$this->chr["serviceMark"], $parsedHTMLtoken["value"]);
		$parsedHTMLtoken["value"] = str_replace(array("(tm)", "(TM)"), $this->chr["tradeMark"], $parsedHTMLtoken["value"]);

		return $parsedHTMLtoken;
	}

	// Converts ASCII math operators inside simple numeric equations to typographic
	// ones (via _smart_math_callback), then reverts the cases that are not math:
	// plain number ranges, simple fractions and dates.
	// expecting parsedHTML token of type text
	// Returns the token with a rewritten "value" (unchanged when the "smartMath" setting is off).
	function smart_math($parsedHTMLtoken) {
		if(!isset($this->settings["smartMath"]) || !$this->settings["smartMath"]) return $parsedHTMLtoken;

		$minus    = $this->chr["minus"];
		$division = $this->chr["division"];
		$text     = $parsedHTMLtoken["value"];

		//first, let's find math equations
		$pattern = "/
				(?<=\A|\s)										# lookbehind assertion: proceeded by beginning of string or space
				[\.,\'\"\¿\¡".$this->chr["ellipses"].$this->chr["singleQuoteOpen"].$this->chr["doubleQuoteOpen"].$this->chr["guillemetOpen"].$this->chr["guillemetClose"].$this->chr["singleLow9Quote"].$this->chr["doubleLow9Quote"]."]*
																# allowed proceeding punctuation
				[\-\(".$minus."]*								# optionally proceeded by dash, minus sign or open parenthesis
				[0-9]+											# must begin with a number
				(\.[0-9]+)?										# optionally allow decimal values after first integer
				(												# followed by a math symbol and a number
					[\/\*x\-+=\^".$minus.$this->chr["multiplication"].$division."]
																# allowed math symbols
					[\-\(".$minus."]*							# opptionally preceeded by dash, minus sign or open parenthesis
					[0-9]+										# must begin with a number
					(\.[0-9]+)?									# optionally allow decimal values after first integer
					[\-\(\)".$minus."]*							# opptionally preceeded by dash, minus sign or parenthesis
				)+
				[\.,;:\'\"\?\!".$this->chr["ellipses"].$this->chr["singleQuoteClose"].$this->chr["doubleQuoteClose"].$this->chr["guillemetOpen"].$this->chr["guillemetClose"]."]*
																# allowed trailing punctuation
				(?=\Z|\s)										# lookahead assertion: followed by end of string or space
			/ux";
		$text = preg_replace_callback($pattern, array($this, '_smart_math_callback'), $text);

		// shared pieces of the "revert" patterns below
		$lead   = "(?<=\s|\A|".$this->chr["noBreakSpace"].")";                                    // what may precede the number or date
		$trail  = "(?=\s|\Z|\)|\]|\.|\,|\?|\;|\:|\'|\"|\!|".$this->chr["noBreakSpace"].")";     // what may follow it
		$dash   = "[\-".$minus."]";
		$slash  = "[\/".$division."]";

		// revert 4-4 to plain minus-hyphen so as to not mess with ranges of numbers (i.e. pp. 46-50)
		$pattern = "/
				(
					{$lead}
					\d+
				)
				{$dash}
				(
					\d+
					{$trail}
				)
			/xu";
		$text = preg_replace($pattern, "$1-$2", $text);

		//revert fractions to basic slash
		// we'll leave styling fractions to smart_fractions
		$pattern = "/
				(
					(?<=\s|\A|\'|\"|".$this->chr["noBreakSpace"].")
					\d+
				)
				".$division."
				(
					\d+
					(?:st|nd|rd|th)?
					{$trail}
				)
			/xu";
		$text = preg_replace($pattern, "$1/$2", $text);

		// revert date back to original formats
		// YYYY-MM-DD
		$pattern = "/
				(
					{$lead}
					[12][0-9]{3}
				)
				{$dash}
				(
					(?:[0]?[1-9]|[1][0-2])
				)
				{$dash}
				(
					(?:[0]?[1-9]|[12][0-9]|[3][0-1])
					{$trail}
				)
			/xu";
		$text = preg_replace($pattern, "$1-$2-$3", $text);

		// MM-DD-YYYY or DD-MM-YYYY
		$pattern = "/
				(?:
					(?:
						(
							{$lead}
							(?:[0]?[1-9]|[1][0-2])
						)
						{$dash}
						(
							(?:[0]?[1-9]|[12][0-9]|[3][0-1])
						)
					)
					|
					(?:
						(
							{$lead}
							(?:[0]?[1-9]|[12][0-9]|[3][0-1])
						)
						{$dash}
						(
							(?:[0]?[1-9]|[1][0-2])
						)
					)
				)
				{$dash}
				(
					[12][0-9]{3}
					{$trail}
				)
			/xu";
		$text = preg_replace($pattern, "$1$3-$2$4-$5", $text);

		// YYYY-MM or YYYY-DDD next
		$pattern = "/
				(
					{$lead}
					[12][0-9]{3}
				)
				{$dash}
				(
					(?:
						(?:[0][1-9]|[1][0-2])
						|
						(?:[0][0-9][1-9]|[1-2][0-9]{2}|[3][0-5][0-9]|[3][6][0-6])
					)
					{$trail}
				)
			/xu";
		// this pattern used to be built and then overwritten without ever being applied
		$text = preg_replace($pattern, "$1-$2", $text);

		// MM/DD/YYYY or DD/MM/YYYY
		$pattern = "/
				(?:
					(?:
						(
							{$lead}
							(?:[0][1-9]|[1][0-2])
						)
						{$slash}
						(
							(?:[0][1-9]|[12][0-9]|[3][0-1])
						)
					)
					|
					(?:
						(
							{$lead}
							(?:[0][1-9]|[12][0-9]|[3][0-1])
						)
						{$slash}
						(
							(?:[0][1-9]|[1][0-2])
						)
					)
				)
				{$slash}
				(
					[12][0-9]{3}
					{$trail}
				)
			/xu";
		$text = preg_replace($pattern, "$1$3/$2$4/$5", $text);

		$parsedHTMLtoken["value"] = $text;
		return $parsedHTMLtoken;
	}

	// preg_replace_callback handler for smart_math(): swaps the ASCII operators in the
	// matched equation ($matches[0]) for their typographic characters and returns it.
	function _smart_math_callback($matches) {
		$ascii = array("-", "/", "x", "*");
		$typographic = array(
			$this->chr["minus"],
			$this->chr["division"],
			$this->chr["multiplication"],
			$this->chr["multiplication"],
		);
		return str_replace($ascii, $typographic, $matches[0]);
	}

	//expecting parsedHTML token of type text
	// purposefully seperatred from smart_math because of HTML code injection
	function smart_exponents($parsedHTMLtoken) {
		if(!isset($this->settings["smartExponents"]) || !$this->settings["smartExponents"]) return $parsedHTMLtoken;

		//handle exponents (ie.
// 4^2)
		$pat = "/
			\b
			(\d+)
			\^
			(\w+)
			\b
		/xu";
		// NOTE(review): the <sup> wrapper is restored here; without it the caret was simply
		// deleted ("4^2" became "42"). Confirm the tag against the upstream source.
		$parsedHTMLtoken["value"] = preg_replace($pat, '$1<sup>$2</sup>', $parsedHTMLtoken["value"]);

		return $parsedHTMLtoken;
	}

	// Glues "1 1/2"-style mixed numbers together with a no-break space and turns the
	// slash of simple fractions into a fraction slash.
	// expecting parsedHTML token of type text
	// call before sytle_numbers
	// call after smart_ordinal_suffix
	// purposefully seperatred from smart_math because of HTML code injection
	// Returns the token with a rewritten "value" (unchanged when both the
	// "smartFractions" and "fractionSpacing" settings are off).
	function smart_fractions($parsedHTMLtoken) {
		$useSpacing   = isset($this->settings["fractionSpacing"]) && $this->settings["fractionSpacing"];
		$useFractions = isset($this->settings["smartFractions"]) && $this->settings["smartFractions"];
		if(!$useSpacing && !$useFractions) return $parsedHTMLtoken;

		// because without simple variables, the pattern fails...
		$nbsp = $this->chr['noBreakSpace'];
		$nbnsp = $this->chr['noBreakNarrowSpace'];

		if($useSpacing) {
			// narrow space when the fraction itself gets styled, regular no-break space otherwise
			// (the old elseif tested isset() on "fractionSpacing" where "smartFractions" was meant)
			$pat = "/\b(\d+)\s(\d+\s?\/\s?\d+)\b/";
			$glue = $useFractions ? $nbnsp : $nbsp;
			$parsedHTMLtoken["value"] = preg_replace($pat, '$1'.$glue.'$2', $parsedHTMLtoken["value"]);
		}

		if($useFractions) {
			// The trailing alternation now uses $nbsp and $nbnsp: "$this->chr['...']" inside a
			// double-quoted string interpolates $this->chr alone, which yields "Array['...']".
			// The zero-width space after the slash is optional, so fractions are matched whether
			// or not wrap_hard_hyphens has run. The ordinal sub-pattern has its "<sup>" opener back.
			$pat = "/
				(?<=\A|\s|$nbsp|$nbnsp)								# lookbehind assertion: makes sure we are not messing up a url
				(\d+)
				(?:\s?\/\s?".$this->chr["zeroWidthSpace"]."?)		# strip out any zero-width spaces inserted by wrap_hard_hyphens
				(\d+)
				(
					(?:\<sup\>(?:st|nd|rd|th)<\/sup\>)?				# handle ordinals after fractions
					(?:\Z|\s|$nbsp|$nbnsp|\.|\!|\?|\)|\;|\:|\'|\")	# makes sure we are not messing up a url
				)
				/xu";
			// NOTE(review): the replacement is left exactly as found; if upstream wraps the
			// numerator and denominator in markup, that markup is not visible here.
			$parsedHTMLtoken["value"] = preg_replace($pat, '$1'.$this->chr["fractionSlash"].'$2$3', $parsedHTMLtoken["value"]);
		}

		return $parsedHTMLtoken;
	}

	//DEPRECIATED!!
	// Kept for backward compatibility; simply forwards to smart_math().
	//expecting parsedHTML token of type text
	function smart_multiplication($parsedHTMLtoken) {
		return $this->smart_math($parsedHTMLtoken);
	}

	// Superscripts the suffix of ordinal numbers (1st, 2nd, 3rd, 4th ...).
	// expecting parsedHTML token of type text
	// call before sytle_numbers
	function smart_ordinal_suffix($parsedHTMLtoken) {
		if(!isset($this->settings["smartOrdinalSuffix"]) || !$this->settings["smartOrdinalSuffix"]) return $parsedHTMLtoken;

		// smart_fractions() looks for "(st|nd|rd|th)</sup>" after a fraction, so the suffix
		// has to be wrapped in <sup>; without the tags this replacement was a no-op
		$parsedHTMLtoken["value"] = preg_replace("/\b(\d+)(st|nd|rd|th)\b/", '$1'.'<sup>$2</sup>', $parsedHTMLtoken["value"]);

		return $parsedHTMLtoken;
	}

	//expecting parsedHTML token of type text
	function single_character_word_spacing($parsedHTMLtoken) {
		if(!isset($this->settings["singleCharacterWordSpacing"]) || !$this->settings["singleCharacterWordSpacing"]) return $parsedHTMLtoken;

		// add $nextChr and $prevChr for context
		$nextChr = "";
		$prevChr = "";
		if(isset($parsedHTMLtoken["prevChr"]) && $parsedHTMLtoken["prevChr"] != "") {
			$prevChr = $parsedHTMLtoken["prevChr"];
			$parsedHTMLtoken["value"] = $prevChr.$parsedHTMLtoken["value"];
		}
		if(isset($parsedHTMLtoken["nextChr"]) && $parsedHTMLtoken["nextChr"] != "") {
			$nextChr = $parsedHTMLtoken["nextChr"];
			$parsedHTMLtoken["value"] = $parsedHTMLtoken["value"].$nextChr;
		}

		// tie a one-character word to the word that follows it
		$parsedHTMLtoken["value"] = preg_replace(
			"/
				(?:
					(\s)
					(\w)
					\s
					(?=\w)
				)
			/xu",
			'$1$2'.$this->chr['noBreakSpace'],
			$parsedHTMLtoken["value"]
		);

		//if we have adjacent characters remove them from the text
		$encodings = array("ASCII","UTF-8");
		$e = mb_detect_encoding($parsedHTMLtoken["value"]."a", $encodings);// ."a" is a hack; see http://www.php.net/manual/en/function.mb-detect-encoding.php#81936
		if(!isset($e) || $e == "") $e = "ASCII";

		if($prevChr != "") {
$parsedHTMLtoken["value"] = mb_substr($parsedHTMLtoken["value"], 1, mb_strlen($parsedHTMLtoken["value"], $e), $e);
		}
		if($nextChr != "") {
			$parsedHTMLtoken["value"] = mb_substr($parsedHTMLtoken["value"], 0, mb_strlen($parsedHTMLtoken["value"], $e)-1, $e);
		}

		return $parsedHTMLtoken;
	}

	// Puts thin spaces around em and en dashes, whether they were spaced or set tight.
	// expecting parsedHTML token of type text
	function dash_spacing($parsedHTMLtoken) {
		if(!isset($this->settings["dashSpacing"]) || !$this->settings["dashSpacing"]) return $parsedHTMLtoken;

		$thin = $this->chr['thinSpace'];
		// em dashes first, then en dashes; only one of $1 / $2 is ever filled per match
		foreach(array($this->chr['emDash'], $this->chr['enDash']) as $dash) {
			$parsedHTMLtoken["value"] = preg_replace(
				"/
					(?:
						\s
						(".$dash.")
						\s
					)
					|
					(?:
						(?<=\S)							# lookbehind assertion
						(".$dash.")
						(?=\S)							# lookahead assertion
					)
				/xu",
				$thin.'$1$2'.$thin,
				$parsedHTMLtoken["value"]
			);
		}

		return $parsedHTMLtoken;
	}

	// Collapses runs of whitespace into a single space character.
	// expecting parsedHTML token of type text
	// Returns the token with a rewritten "value" (unchanged when the "spaceCollapse" setting is off).
	function space_collapse($parsedHTMLtoken) {
		if(!isset($this->settings["spaceCollapse"]) || !$this->settings["spaceCollapse"]) return $parsedHTMLtoken;

		# Space characters handled here, beyond what \s matches:
		#	no-break space | ethiopic wordspace | en quad | em quad | en space | em space
		#	three-per-em space | four-per-em space | six-per-em space | figure space
		#	punctuation space | thin space | hair space | narrow no-break space
		#	medium mathematical space | ideographic space
		# NOTE(review): an earlier comment said the zero-width space, zero-width non-joiner and
		#	zero-width joiner (U+200B, U+200C, U+200D) should not count as spaces because they
		#	occur inside words, yet all three are in the list below - confirm which is intended.
		$htmlSpaces = '
			\x{00a0}		# no-break space
			|
			\x{1361}		# ethiopic wordspace
			|
			\x{2000}		# en quad-space
			|
			\x{2001}		# em quad-space
			|
			\x{2002}		# en space
			|
			\x{2003}		# em space
			|
			\x{2004}		# three-per-em space
			|
			\x{2005}		# four-per-em space
			|
			\x{2006}		# six-per-em space
			|
			\x{2007}		# figure space
			|
			\x{2008}		# punctuation space
			|
			\x{2009}		# thin space
			|
			\x{200a}		# hair space
			|
			\x{200b}		# zero-width space
			|
			\x{200c}		# zero-width non-joiner
			|
			\x{200d}		# zero-width joiner
			|
			\x{202f}		# narrow no-break space
			|
			\x{205f}		# medium mathematical space
			|
			\x{3000}		# ideographic space
			'; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8)

		// normal spacing
		$parsedHTMLtoken["value"] = preg_replace( "/\s+/xu", " ", $parsedHTMLtoken["value"] );

		// nbsp get's priority.  if nbsp exists in a string of spaces, it collapses to nbsp
		$parsedHTMLtoken["value"] = preg_replace( "/(?:\s|$htmlSpaces)*".$this->chr["noBreakSpace"]."(?:\s|$htmlSpaces)*/xu", $this->chr["noBreakSpace"], $parsedHTMLtoken["value"] );

		// for any other spaceing, replace with the first occurance of an unusual space character
		$parsedHTMLtoken["value"] = preg_replace( "/(?:\s)*($htmlSpaces)(?:\s|$htmlSpaces)*/xu", "$1", $parsedHTMLtoken["value"] );

		// remove all spacing at beginning of block level elements
		if(!isset($parsedHTMLtoken["prevChr"]) || $parsedHTMLtoken["prevChr"] == NULL) { // we have the first text in a block level element
			$parsedHTMLtoken["value"] = preg_replace( "/\A(?:\s|$htmlSpaces)+/xu", "", $parsedHTMLtoken["value"] );
		}
		/**/
		return $parsedHTMLtoken;
	}

	// Replaces the space between a number and a following unit with a no-break (narrow) space.
	// Units come from the optional "units" setting plus the built-in list below.
	// expecting parsedHTML token of type text
	// Returns the token with a rewritten "value" (unchanged when the "unitSpacing" setting is off).
	function unit_spacing($parsedHTMLtoken) {
		if(!isset($this->settings["unitSpacing"]) || !$this->settings["unitSpacing"]) return $parsedHTMLtoken;

		$units = array();
		if(isset($this->settings["units"])) {
			foreach($this->settings["units"] as $unit) {
				if($unit === "") continue; // an empty alternative would match after every number
				// Backslash-escape every ASCII character that is not a letter, digit or underscore.
				// This covers the regex metacharacters, the "/" delimiter (e.g. "km/h"), and the
				// "#" and space characters that the x modifier would otherwise swallow.
				$units[] = preg_replace('/[^A-Za-z0-9_\x80-\xff]/', '\\\\$0', $unit);
			}
		}
		$customUnits = implode("|", $units);
		$customUnits .= ($customUnits) ? "|" : "" ;
		$unitPattern = $customUnits.'

			### Temporal units
			(?:ms|s|secs?|mins?|hrs?)\.?|
			milliseconds?|seconds?|minutes?|hours?|days?|years?|decades?|century|centuries|millennium|millennia|

			### Imperial units
			(?:in|ft|yd|mi)\.?|
			(?:ac|ha|oz|pt|qt|gal|lb|st)\.?|
			s\.f\.|sf|s\.i\.|si|square[ ]feet|square[ ]foot|
			inch|inches|foot|feet|yards?|miles?|acres?|hectares?|ounces?|pints?|quarts?|gallons?|pounds?|stones?|

			### Metric units (with prefixes)
			(?:p|µ|[mcdhkMGT])?
			(?:[mgstAKNJWCVFSTHBL]|mol|cd|rad|Hz|Pa|Wb|lm|lx|Bq|Gy|Sv|kat|Ω|Ohm|&Omega;|&\#0*937;|&\#[xX]0*3[Aa]9;)|
			(?:nano|micro|milli|centi|deci|deka|hecto|kilo|mega|giga|tera)?
			(?:liters?|meters?|grams?|newtons?|pascals?|watts?|joules?|amperes?)|

			### Computers units (KB, Kb, TB, Kbps)
			[kKMGT]?(?:[oBb]|[oBb]ps|flops)|

			### Money
			¢|M?(?:£|¥|€|\$)|

			### Other units
			°[CF]? |
			%|pi|M?px|em|en|[NSEOW]|[NS][EOW]|mbar
		'; // required modifiers: x (multiline pattern)
		// Two fixes in the list above: the "|" after the (?:ac|...|st) group was missing, which
		// fused those units with "s.f."; and the dollar sign is now escaped, because a bare "$"
		// is an end-of-subject anchor.

		$parsedHTMLtoken["value"] = preg_replace("/(\d\.?)\s($unitPattern)\b/x", '$1'.$this->chr["noBreakNarrowSpace"].'$2', $parsedHTMLtoken["value"]);

		return $parsedHTMLtoken;
	}

	// expecting parsedText tokens (the parameter is iterated and each token's "value" rewritten)
	function wrap_hard_hyphens($parsedTextTokens) {
		if((isset($this->settings["hyphenHardWrap"]) && $this->settings["hyphenHardWrap"]) || (isset($this->settings["smartDashes"]) && $this->settings["smartDashes"])) {

			foreach($parsedTextTokens as &$parsedTextToken) {

				if(isset($this->settings["hyphenHardWrap"]) && $this->settings["hyphenHardWrap"]) {
					// allow a line break after hyphens, underscores and slashes
					$hyphens = array('-',$this->chr["hyphen"]);
					$parsedTextToken["value"] = str_replace($hyphens, "-".$this->chr["zeroWidthSpace"], $parsedTextToken["value"]);
					$parsedTextToken["value"] = str_replace("_", "_".$this->chr["zeroWidthSpace"], $parsedTextToken["value"]);
					$parsedTextToken["value"] = str_replace("/", "/".$this->chr["zeroWidthSpace"], $parsedTextToken["value"]);
				}
				if(isset($this->settings["smartDashes"]) && $this->settings["smartDashes"])
					// handled here because we need to know we are inside a word and not a url
					$parsedTextToken["value"] = str_replace("-",
$this->chr["hyphen"], $parsedTextToken["value"]);
			}
		}

		return $parsedTextTokens;
	}

	// Protects the last word of a block from being left alone on a line. The work is
	// done per match by _dewidow_callback().
	// expecting parsedHTML token of type text; only acts on tokens without a "nextChr" entry
	// Returns the token, with "value" rewritten when a widow was found and protected.
	function dewidow($parsedHTMLtoken) {
		// intervening inline tags may interfere with widow identification, but that is a sacrifice of using the parser
		// intervening tags will only interfere if they separate the widow from previous or preceding whitespace
		if(!isset($this->settings["dewidow"]) || !$this->settings["dewidow"]) return $parsedHTMLtoken;

		if(!isset($parsedHTMLtoken["nextChr"])) {
			// we have the last type "text" child of a block level element

			$encodings = array("ASCII","UTF-8", "ISO-8859-1");
			$encoding = mb_detect_encoding($parsedHTMLtoken["value"]."a", $encodings); // ."a" is a hack; see http://www.php.net/manual/en/function.mb-detect-encoding.php#81936
			$u = '';
			if("UTF-8" == $encoding) {
				// UTF-8 text needs the u modifier and the mb_* string functions
				$u = "u";
				if(!function_exists('mb_strlen')) return $parsedHTMLtoken;
			} elseif("ASCII" != $encoding) {
				// any other encoding is left untouched
				return $parsedHTMLtoken;
			}

			$widowPattern = "/
				(?:
					\A
					|
					(?:
						(															#subpattern 1: space before
							[\s".$this->chr["zeroWidthSpace"].$this->chr["softHyphen"]."]+
						)
						(															#subpattern 2: neighbors widow (short as possible)
							[^\s".$this->chr["zeroWidthSpace"].$this->chr["softHyphen"]."]+
						)
					)
				)
				(																	#subpattern 3: space between
					[\s".$this->chr["noBreakSpace"]."]+
				)
				(																	#subpattern 4: widow
					[^\s".$this->chr["zeroWidthSpace"]."]+?
				)
				(																	#subpattern 5: any trailing punctuation or spaces
					[^\w]*
				)
				\Z
			/x$u";

			$parsedHTMLtoken["value"] = preg_replace_callback(
				$widowPattern,
				array($this, '_dewidow_callback'),
				$parsedHTMLtoken["value"]
			);
		}

		return $parsedHTMLtoken;
	}

	// preg_replace_callback handler for dewidow().
	// $widow holds the match: [1] space before the neighbour, [2] the widow's neighbour,
	// [3] space between, [4] the widow, [5] trailing punctuation or spaces.
	// Returns the matched text, with the space in front of the widow replaced by a
	// no-break space when the "dewidowMaxPull" / "dewidowMaxLength" limits allow it.
	function _dewidow_callback($widow) {
		if(!isset($this->settings["dewidowMaxPull"]) || !$this->settings["dewidowMaxPull"] || !isset($this->settings["dewidowMaxLength"]) || !$this->settings["dewidowMaxLength"]) return $widow[0];

		$encodings = array("ASCII","UTF-8", "ISO-8859-1");
		$multibyte = FALSE;
		$encoding = mb_detect_encoding($widow[0]."a", $encodings); // ."a" is a hack; see http://www.php.net/manual/en/function.mb-detect-encoding.php#81936
		if("UTF-8" == $encoding) $multibyte = TRUE;

		// if we are here, we know that widows are being protected in some fashion
		// with that, we will assert that widows should never be hyphenated or wrapped
		// as such, we will strip soft hyphens and zero-width-spaces
		$widow[4] = str_replace($this->chr["zeroWidthSpace"], "", $widow[4]);
		$widow[4] = str_replace($this->chr["softHyphen"], "", $widow[4]);

		// Turn trailing whitespace into no-break spaces. The previous mb_ereg_replace() call was
		// given a "/"-delimited pattern; mb_ereg patterns take no delimiters, so it never matched.
		// preg_replace() with the u modifier on UTF-8 text avoids the byte-level corruption that
		// the original comment described for the plain "/\s+/" pattern.
		$trailing = preg_replace($multibyte ? "/\s+/u" : "/\s+/", $this->chr["noBreakSpace"], $widow[5]);
		if($trailing !== NULL) $widow[5] = $trailing; // keep the original text if PCRE reports an error
		$widow[5] = str_replace($this->chr["zeroWidthSpace"], "", $widow[5]);
		$widow[5] = str_replace($this->chr["softHyphen"], "", $widow[5]);

		$unchanged = $widow[1].$widow[2].$widow[3].$widow[4].$widow[5];

		// eject if widows neighbor is proceeded by a no break space (the pulled text would be too long)
		// (haystack and needle were swapped before, so only a lone no-break space was detected)
		if($widow[1] == "" || strpos($widow[1], $this->chr["noBreakSpace"]) !== FALSE) return $unchanged;

		// eject if widows neighbor length exceeds the max allowed or widow length exceeds max allowed
		if($multibyte) {
			$pullLength  = mb_strlen($widow[2]);
			$widowLength = mb_strlen($widow[4]);
		} else { // single byte version of previous
			$pullLength  = strlen($widow[2]);
			$widowLength = strlen($widow[4]);
		}
		if( ($widow[2] != "" && $pullLength > $this->settings["dewidowMaxPull"]) || $widowLength > $this->settings["dewidowMaxLength"] ) return $unchanged;

		// lets protect some widows!
		return $widow[1].$widow[2].$this->chr["noBreakSpace"].$widow[4].$widow[5];
	}

	// expecting parsedText tokens
	function wrap_urls($parsedTextTokens) {
		if(!isset($this->settings["urlWrap"]) || !$this->settings["urlWrap"] || !isset($this->settings["urlMinAfterWrap"]) || !$this->settings["urlMinAfterWrap"]) return $parsedTextTokens;

		// test for and parse urls
		$validTLD =
'ac|ad|aero|ae|af|ag|ai|al|am|an|ao|aq|arpa|ar|asia|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|biz|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|cat|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|com|coop|co|cr|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|edu|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gov|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|info|int|in|io|iq|ir|is|it|je|jm|jobs|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mil|mk|ml|mm|mn|mobi|mo|mp|mq|mr|ms|mt|museum|mu|mv|mw|mx|my|mz|name|na|nc|net|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|org|pa|pe|pf|pg|ph|pk|pl|pm|pn|pro|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tel|tf|tg|th|tj|tk|tl|tm|tn|to|tp|travel|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw'; $urlScheme = '(?:https?|ftps?|file|nfs|feed|itms|itpc)'; $urlPattern = "(?: \A ($urlScheme:\/\/)? # Subpattern 1: contains _http://_ if it exists ( # Subpattern 2: contains subdomains.domain.tld (?: [a-z0-9] # first chr of (sub)domain can not be a hyphen [a-z0-9\-]{0,61} # middle chrs of (sub)domain may be a hyphen; # limit qty of middle chrs so total domain does not exceed 63 chrs [a-z0-9] # last chr of (sub)domain can not be a hyphen \. # dot separator )+ (?: $validTLD # validates top level domain ) (?: # optional port numbers : (?: [1-5]?[0-9]{1,4} | 6[0-4][0-9]{3} | 65[0-4][0-9]{2} | 655[0-2][0-9] | 6553[0-5] ) )? ) ( # Subpattern 3: contains path following domain (?: \/ # marks nested directory [a-z0-9\"\$\-_\.\+!\*\'\(\),;\?:@=&\#]+ # valid characters within directory structure )* [\/]? 
# trailing slash if any ) \Z )"; // required modifiers: x (multiline pattern) i (case insensitive) foreach($parsedTextTokens as &$parsedTextToken) { if(preg_match("`$urlPattern`xi", $parsedTextToken["value"], $urlMatch)) { // $urlMatch[1] holds "http://" // $urlMatch[2] holds "subdomains.domain.tld" // $urlMatch[3] holds the path after the domain $http = ($urlMatch[1]) ? $urlMatch[1].$this->chr["zeroWidthSpace"] : "" ; $domainParts = preg_split('#(\-|\.)#', $urlMatch[2], -1, PREG_SPLIT_DELIM_CAPTURE); //this is a hack, but it works // first, we hyphenate each part // we need it formated like a group of words $parsedWordsLike = array(); foreach($domainParts as $key => $domainPart) { $parsedWordsLike[$key]["value"] = $domainPart; } // do the hyphenation $parsedWordsLike = $this->do_hyphenate($parsedWordsLike); // restore format foreach($parsedWordsLike as $key => $parsedWordLike) { $domainParts[$key] = $parsedWordLike["value"]; } foreach ($domainParts as $key => &$domainPart) { //then we swap out each soft-hyphen" with a zero-space $domainPart = str_replace($this->chr["softHyphen"], $this->chr["zeroWidthSpace"], $domainPart); //we also insert zero-spaces before periods and hyphens if($key > 0 && strlen($domainPart) == 1) { $domainPart = $this->chr["zeroWidthSpace"].$domainPart; } } //lastly let's recombine $domain = implode($domainParts); //break up the URL path to individual characters $pathParts = str_split($urlMatch[3], 1); $pathCount = count($pathParts); $path = ""; for($i = 0; $i < $pathCount; $i++) { $path .= (0 == $i || $pathCount - $i < $this->settings["urlMinAfterWrap"]) ? 
$pathParts[$i] : $this->chr["zeroWidthSpace"].$pathParts[$i]; } $parsedTextToken["value"] = $http.$domain.$path; } } return $parsedTextTokens; } // expecting parsedText tokens function wrap_emails($parsedTextTokens) { if(!isset($this->settings["emailWrap"]) || !$this->settings["emailWrap"]) return $parsedTextTokens; // test for and parse urls $validTLD = 'ac|ad|aero|ae|af|ag|ai|al|am|an|ao|aq|arpa|ar|asia|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|biz|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|cat|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|com|coop|co|cr|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|edu|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gov|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|info|int|in|io|iq|ir|is|it|je|jm|jobs|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mil|mk|ml|mm|mn|mobi|mo|mp|mq|mr|ms|mt|museum|mu|mv|mw|mx|my|mz|name|na|nc|net|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|org|pa|pe|pf|pg|ph|pk|pl|pm|pn|pro|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tel|tf|tg|th|tj|tk|tl|tm|tn|to|tp|travel|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw'; $emailPattern = "(?: \A [a-z0-9\!\#\$\%\&\'\*\+\/\=\?\^\_\`\{\|\}\~\-]+ (?: \. [a-z0-9\!\#\$\%\&\'\*\+\/\=\?\^\_\`\{\|\}\~\-]+ )* @ (?: [a-z0-9] [a-z0-9\-]{0,61} [a-z0-9] \. 
)+ (?: $validTLD ) \Z )"; // required modifiers: x (multiline pattern) i (case insensitive) foreach($parsedTextTokens as &$parsedTextToken) { if(preg_match("/$emailPattern/xi", $parsedTextToken["value"], $urlMatch)) { $parsedTextToken["value"] = preg_replace("/([^a-zA-Z])/", '$1'.$this->chr["zeroWidthSpace"], $parsedTextToken["value"]); } } return $parsedTextTokens; } // expecting parsedHTML token of type text // wraps words of all caps (may include numbers) in // only call if you are certain that no html tags have been injected containing capital letters // call before style_numbers function style_caps($parsedHTMLtoken) { if(!isset($this->settings["styleCaps"]) || !$this->settings["styleCaps"]) return $parsedHTMLtoken; // \p{Lu} equals upper case letters and should match non english characters; since PHP 4.4.0 and 5.1.0 // for more info, see http://www.regextester.com/pregsyntax.html#regexp.reference.unicode $pattern = ' (?chr["zeroWidthSpace"].$this->chr["softHyphen"].']) # negative lookbehind assertion ( (?: # CASE 1: " 9A " [0-9]+ # starts with at least one number \p{Lu} # must contain at least one capital letter (?:\p{Lu}|[0-9]|\-|_|'.$this->chr["zeroWidthSpace"].'|'.$this->chr["softHyphen"].')* # may be followed by any number of numbers capital letters, hyphens, underscores, zero width spaces, or soft hyphens ) | (?: # CASE 2: " A9 " \p{Lu} # starts with capital letter (?:\p{Lu}|[0-9]) # must be followed a number or capital letter (?:\p{Lu}|[0-9]|\-|_|'.$this->chr["zeroWidthSpace"].'|'.$this->chr["softHyphen"].')* # may be followed by any number of numbers capital letters, hyphens, underscores, zero width spaces, or soft hyphens ) ) (?![\w\-_'.$this->chr["zeroWidthSpace"].$this->chr["softHyphen"].']) # negative lookahead assertion '; // required modifiers: x (multiline pattern) u (utf8) $parsedHTMLtoken["value"] = preg_replace("/$pattern/xu", '$1', $parsedHTMLtoken["value"]); return $parsedHTMLtoken; } // expecting parsedHTML token of type text // wraps 
numbers in (even numbers that appear inside a word, i.e. A9 becomes A9) // call after style_caps so A9 becomes A9) // only call if you are certain that no html tags have been injected containing numbers // call after smart_fractions, smart_ordinal_suffix and style_caps function style_numbers($parsedHTMLtoken) { if(!isset($this->settings["styleNumbers"]) || !$this->settings["styleNumbers"]) return $parsedHTMLtoken; $pattern = '([0-9]+)'; // required modifier: u (utf8) $parsedHTMLtoken["value"] = preg_replace("/$pattern/u", '$1', $parsedHTMLtoken["value"]); return $parsedHTMLtoken; } // expecting parsedHTML token of type text // wraps ampersands in (i.e. H&J becomes H&J) // call after style_caps so H&J becomes H&J) // note that all standalone ampersands were previously converted to & // only call if you are certain that no html tags have been injected containing "&" function style_ampersands($parsedHTMLtoken) { if(!isset($this->settings["styleAmpersands"]) || !$this->settings["styleAmpersands"]) return $parsedHTMLtoken; $pattern = '(\&\;)'; // required modifier: u (utf8) $parsedHTMLtoken["value"] = preg_replace("/$pattern/u", '$1', $parsedHTMLtoken["value"]); return $parsedHTMLtoken; } // expecting parsedHTML token of type text // styles initial quotes and guillemets function style_initial_quotes($parsedHTMLtoken, $isTitle = FALSE) { if(!isset($this->settings["styleInitialQuotes"]) || !$this->settings["styleInitialQuotes"] || !isset($this->settings["initialQuoteTags"]) || !$this->settings["initialQuoteTags"]) return $parsedHTMLtoken; if(!isset($parsedHTMLtoken["prevChr"]) || $parsedHTMLtoken["prevChr"] == NULL) { // we have the first text in a block level element $encodings = array("ASCII","UTF-8", "ISO-8859-1"); $e = mb_detect_encoding($parsedHTMLtoken["value"]."a", $encodings);// ."a" is a hack; see http://www.php.net/manual/en/function.mb-detect-encoding.php#81936 if(!isset($e) || $e == "") $e = "ASCII"; $firstChr = mb_substr($parsedHTMLtoken["value"], 0, 1, $e); 
if($firstChr == "'" || $firstChr == $this->chr["singleQuoteOpen"] || $firstChr == $this->chr["singleLow9Quote"] || $firstChr == "," || $firstChr == "\"" || $firstChr == $this->chr["doubleQuoteOpen"] || $firstChr == $this->chr["guillemetOpen"] || $firstChr == $this->chr["guillemetClose"] || $firstChr == $this->chr["doubleLow9Quote"]) { $style = FALSE; $immediateParent = ""; if($parsedHTMLtoken["parents"]) { $immediateParent = end($parsedHTMLtoken["parents"]); } elseif($isTitle) { // assume page title is h2 $immediateParent = array("tagName" => "h2"); } // TD throws warnings for friendica // Warning: Illegal string offset 'tagName' in // /addon/typography/php-typography/php-typography.php // on line 1964 //if($immediateParent["tagName"]) { // foreach($this->settings["initialQuoteTags"] as $tag) { // if($tag == $immediateParent["tagName"]) // $style = TRUE; // } //} if($style) { if($firstChr == "'" || $firstChr == $this->chr["singleQuoteOpen"] || $firstChr == $this->chr["singleLow9Quote"] || $firstChr == ",") { $parsedHTMLtoken["value"] = ''.$firstChr.''.mb_substr($parsedHTMLtoken["value"], 1, mb_strlen($parsedHTMLtoken["value"], $e), $e); } else { // double quotes or guillemets $parsedHTMLtoken["value"] = ''.$firstChr.''.mb_substr($parsedHTMLtoken["value"], 1, mb_strlen($parsedHTMLtoken["value"], $e), $e); } } } } return $parsedHTMLtoken; } //injects the PatGen segments pattern into the PatGen words pattern function hyphenation_pattern_injection($wordPattern, $segmentPattern, $segmentPosition, $segmentLength) { for($numberPosition=$segmentPosition; $numberPosition <= $segmentPosition + $segmentLength; $numberPosition++) { $wordPattern[$numberPosition] = (intval($wordPattern[$numberPosition]) >= intval($segmentPattern[$numberPosition-$segmentPosition])) ? 
$wordPattern[$numberPosition] : $segmentPattern[$numberPosition-$segmentPosition]; } return $wordPattern; } // expecting parseText tokens filtered to words function hyphenate($parsedTextTokens, $isTitle = FALSE) { if(!isset($this->settings["hyphenation"]) || !$this->settings["hyphenation"]) return $parsedTextTokens; $isHeading = FALSE; if(isset($parsedTextTokens["parents"])) { foreach($parsedTextTokens["parents"] as $tagName) { if($tagName == "h1" || $tagName == "h2" || $tagName == "h3" || $tagName == "h4" || $tagName == "h5" || $tagName == "h6") $isHeading = TRUE; } } if((!isset($this->settings["hyphenateTitle"]) || !$this->settings["hyphenateTitle"]) && ($isTitle || $isHeading)) return $parsedTextTokens; // call functionality as seperate function so it can be run without test for setting["hyphenation"] - such as with url wrapping return $this->do_hyphenate($parsedTextTokens); } // expecting parsedText tokens filtered to words function do_hyphenate($parsedTextTokens) { if(!isset($this->settings["hyphenMinLength"]) || !$this->settings["hyphenMinLength"]) return $parsedTextTokens; if(!isset($this->settings["hyphenMinBefore"]) || !$this->settings["hyphenMinBefore"]) return $parsedTextTokens; if(!isset($this->settings["hyphenationPatternMaxSegment"])) return $parsedTextTokens; if(!isset($this->settings["hyphenationPatternExceptions"])) return $parsedTextTokens; if(!isset($this->settings["hyphenationPattern"])) return $parsedTextTokens; $encodings = array("ASCII","UTF-8", "ISO-8859-1"); $multibyte = FALSE; $u = ""; // make sure we have full exceptions list if(!isset($this->settings["hyphenationExceptions"])) { if($this->settings["hyphenationPatternExceptions"] || (isset($this->settings["hyphenationCustomExceptions"]) && $this->settings["hyphenationCustomExceptions"])) { $exceptions = array(); if(isset($this->settings["hyphenationCustomExceptions"])) { // merges custom and language specific word hyphenations $exceptions = 
array_merge($this->settings["hyphenationCustomExceptions"], $this->settings["hyphenationPatternExceptions"]); } else { $exceptions = $this->settings["hyphenationPatternExceptions"]; } $this->settings["hyphenationExceptions"] = $exceptions; } else { $this->settings["hyphenationExceptions"]=array(); } } foreach($parsedTextTokens as &$parsedTextToken) { // ."a" is a hack; see http://www.php.net/manual/en/function.mb-detect-encoding.php#81936 $encoding = mb_detect_encoding($parsedTextToken["value"]."a", $encodings); if("UTF-8" == $encoding) { $multibyte = TRUE; $u = "u"; if(!function_exists('mb_strlen')) continue; } elseif("ASCII" != $encoding) { continue; } if($multibyte) { $wordLength = mb_strlen($parsedTextToken["value"], "UTF-8"); $theKey = mb_strtolower($parsedTextToken["value"], "UTF-8"); } else { //same as above without mutlibyte string functions to improve preformance $wordLength = strlen($parsedTextToken["value"]); $theKey = strtolower($parsedTextToken["value"]); } if($wordLength < $this->settings["hyphenMinLength"]) continue; //if this is a capitalized word, and settings do not allow hyphenation of such, abort! // note. 
this is different than uppercase words, where we are looking for title case if((!isset($this->settings["hyphenateTitleCase"]) || !$this->settings["hyphenateTitleCase"]) && substr($theKey,0,1) != substr($parsedTextToken["value"],0,1)) continue; // give exceptions preference if(isset($this->settings["hyphenationExceptions"][$theKey])) { //Set the wordPattern - this method keeps any contextually important capitalization if($multibyte) { $lowercaseHyphenedWord = $this->settings["hyphenationExceptions"][$theKey]; $lhwArray = $this->mb_str_split($lowercaseHyphenedWord, 1, "UTF-8"); $lhwLength = mb_strlen($lowercaseHyphenedWord, "UTF-8"); } else { //same as above without mutlibyte string functions to improve preformance $lowercaseHyphenedWord = $this->settings["hyphenationExceptions"][$theKey]; $lhwArray = str_split($lowercaseHyphenedWord, 1); $lhwLength = strlen($lowercaseHyphenedWord); } $wordPattern=array(); for($i=0; $i < $lhwLength; $i++) { if("-" == $lhwArray[$i]) { array_push($wordPattern, "9"); $i++; } else { array_push($wordPattern, "0"); } } array_push($wordPattern, "0"); //for consistent length with the other word patterns } if(!isset($wordPattern)) { // first we set up the matching pattern to be a series of zeros one character longer than $parsedTextToken $wordPattern = array(); for($i=0; $i < $wordLength +1; $i++) { array_push($wordPattern, "0"); } // we grab all possible segments from $parsedTextToken of length 2 through $this->settings["hyphenationPatternMaxSegment"] for($segmentLength=2; ($segmentLength <= $wordLength) && ($segmentLength <= $this->settings["hyphenationPatternMaxSegment"]); $segmentLength++) { for($segmentPosition=0; $segmentPosition + $segmentLength <= $wordLength; $segmentPosition++) { if($multibyte) $segment = mb_strtolower(mb_substr($parsedTextToken["value"], $segmentPosition, $segmentLength, "UTF-8"), "UTF-8"); else $segment = strtolower(substr($parsedTextToken["value"], $segmentPosition, $segmentLength)); if(0 == $segmentPosition) { 
if(isset($this->settings["hyphenationPattern"]["begin"][$segment])) { if($multibyte) $segmentPattern = $this->mb_str_split($this->settings["hyphenationPattern"]["begin"][$segment], 1, "UTF-8"); else $segmentPattern = str_split($this->settings["hyphenationPattern"]["begin"][$segment], 1); $wordPattern = $this->hyphenation_pattern_injection($wordPattern, $segmentPattern, $segmentPosition, $segmentLength); } } if($segmentPosition + $segmentLength == $wordLength) { if(isset($this->settings["hyphenationPattern"]["end"][$segment])) { if($multibyte) $segmentPattern = $this->mb_str_split($this->settings["hyphenationPattern"]["end"][$segment], 1, "UTF-8"); else $segmentPattern = str_split($this->settings["hyphenationPattern"]["end"][$segment], 1); $wordPattern = $this->hyphenation_pattern_injection($wordPattern, $segmentPattern, $segmentPosition, $segmentLength); } } if(isset($this->settings["hyphenationPattern"]["all"][$segment])) { if($multibyte) $segmentPattern = $this->mb_str_split($this->settings["hyphenationPattern"]["all"][$segment], 1, "UTF-8"); else $segmentPattern = str_split($this->settings["hyphenationPattern"]["all"][$segment], 1); $wordPattern = $this->hyphenation_pattern_injection($wordPattern, $segmentPattern, $segmentPosition, $segmentLength); } } } } //add soft-hyphen based on $wordPattern if($multibyte) { $wordArray = $this->mb_str_split($parsedTextToken["value"], 1, "UTF-8"); } else { //same as above without mutlibyte string functions to improve preformance $wordArray = str_split($parsedTextToken["value"], 1); } $hyphenatedWord = ""; for($i=0; $i < $wordLength; $i++) { if(($this->is_odd(intval($wordPattern[$i]))) && ($i >= $this->settings["hyphenMinBefore"]) && ($i < $wordLength - $this->settings["hyphenMinAfter"])) { $hyphenatedWord .= $this->chr["softHyphen"].$wordArray[$i]; } else { $hyphenatedWord .= $wordArray[$i]; } } $parsedTextToken["value"] = $hyphenatedWord; unset($wordPattern); } return $parsedTextTokens; } 
########################################################################
    # params: $codes = decimal value(s) corresponding to unicode character(s);
    #         either a single array of code points or one code point per argument
    # Returns: UTF-8 string made of the requested characters
    function uchr($codes) {
        if(is_scalar($codes)) $codes = func_get_args();

        $str = '';
        foreach($codes as $code) {
            // ENT_QUOTES so that 34 (") and 39 (') are decoded as well;
            // ENT_NOQUOTES handed those two back as the undecoded entity text
            $str .= html_entity_decode('&#'.$code.';', ENT_QUOTES, 'UTF-8');
        }
        return $str;
    }

    // is a number odd? returns 0 if even and 1 if odd (also for negative numbers)
    function is_odd($number) {
        return abs($number % 2);
    }

    // multibyte character support is built in to accommodate language support of multibyte alphabets
    // splits $str into chunks of $length characters; returns FALSE when mbstring is unavailable or $length < 1
    function mb_str_split($str, $length = 1, $encoding = 'UTF-8') {
        if(!function_exists('mb_strlen')) return FALSE;
        if($length < 1) return FALSE;

        $result = array();
        $total = mb_strlen($str, $encoding); // measured once instead of on every iteration
        for($i = 0; $i < $total; $i += $length) {
            $result[] = mb_substr($str, $i, $length, $encoding);
        }
        return $result;
    }

##########################################################################################
##########################################################################################
##########################################################################################
###
###     portions of this code have been inspired by:
###         -typogrify (http://code.google.com/p/typogrify/)
###         -WordPress code for wptexturize (http://xref.redalt.com/wptrunk/nav.htm?index.htm)
###         -PHP SmartyPants Typographer (http://michelf.com/projects/php-smartypants/)
###
}