2197 lines
91 KiB
PHP
2197 lines
91 KiB
PHP
<?php
|
|
/*
|
|
Project: PHP Typography
|
|
Project URI: http://kingdesk.com/projects/php-tyography/
|
|
Version: 1.21
|
|
|
|
|
|
Copyright 2009, KINGdesk, LLC. Licensed under the GNU General Public License 2.0. If you use, modify and/or redistribute this software, you must leave the KINGdesk, LLC copyright information, the request for a link to http://kingdesk.com, and the web design services contact information unchanged. If you redistribute this software, or any derivative, it must be released under the GNU General Public License 2.0. This program is distributed without warranty (implied or otherwise) of suitability for any particular purpose. See the GNU General Public License for full license terms <http://creativecommons.org/licenses/GPL/2.0/>.
|
|
|
|
WE DON'T WANT YOUR MONEY: NO TIPS NECESSARY! If you enjoy this plugin, a link to http://kingdesk.com from your website would be appreciated.
|
|
|
|
For web design services, please contact jeff@kingdesk.com.
|
|
*/
|
|
|
|
# if used with multibyte language, UTF-8 encoding is required!
|
|
class phpTypography {
|
|
|
|
var $mb = FALSE; // cannot be changed after load; NOTE(review): presumably the multibyte-mode flag - confirm against the code that sets it
|
|
var $chr = array(); // named typographic characters, filled in by __construct() and the set_smart_quotes_*() methods
|
|
var $settings = array(); // operational attributes, written by the set_*() methods
|
|
var $parsedHTML = array(); // to hold current instance of class parseHTML (assigned in process())
|
|
var $parsedText = array(); // to hold current instance of class parseText (assigned in process())
|
|
|
|
|
|
#=======================================================================
|
|
#=======================================================================
|
|
#== METHODS - SET ATTRIBUTES
|
|
#=======================================================================
|
|
#=======================================================================
|
|
|
|
// __ naming defines the constructor, which is called automatically on each newly-created object
|
|
/**
 * Fills $this->chr with the named typographic characters and, unless told
 * otherwise, applies the default settings.
 *
 * @param bool $setDefaults when truthy, set_defaults() is called once the character table is built
 * @return bool always TRUE
 */
function __construct($setDefaults = TRUE) {
    // name => value; integers are converted through uchr(), strings are stored verbatim.
    // The order below is the order in which the entries are added to $this->chr.
    $characters = array(
        'noBreakSpace'            => 160,
        'noBreakNarrowSpace'      => 160, // should be 8239, but not supported consistently; used in unit spacing
        'copyright'               => 169,
        'guillemetOpen'           => 171,
        'softHyphen'              => 173,
        'registeredMark'          => 174,
        'guillemetClose'          => 187,
        'multiplication'          => 215,
        'division'                => 247,
        'figureSpace'             => 8199,
        'thinSpace'               => 8201,
        'zeroWidthSpace'          => 8203,
        'hyphen'                  => "-", // should be uchr(8208), but IE6 chokes
        'noBreakHyphen'           => 8209,
        'enDash'                  => 8211,
        'emDash'                  => 8212,
        'singleQuoteOpen'         => 8216, // reset in set_smart_quotes_language()
        'singleQuoteClose'        => 8217, // reset in set_smart_quotes_language()
        'apostrophe'              => 8217, // kept separate from singleQuoteClose so quotes can be redefined without disrupting apostrophes
        'singleLow9Quote'         => 8218,
        'doubleQuoteOpen'         => 8220, // reset in set_smart_quotes_language()
        'doubleQuoteClose'        => 8221, // reset in set_smart_quotes_language()
        'doubleLow9Quote'         => 8222,
        'ellipses'                => 8230,
        'singlePrime'             => 8242,
        'doublePrime'             => 8243,
        'singleAngleQuoteOpen'    => 8249,
        'singleAngleQuoteClose'   => 8250,
        'fractionSlash'           => 8260,
        'soundCopyMark'           => 8471,
        'serviceMark'             => 8480,
        'tradeMark'               => 8482,
        'minus'                   => 8722,
        'leftCornerBracket'       => 12300,
        'rightCornerBracket'      => 12301,
        'leftWhiteCornerBracket'  => 12302,
        'rightWhiteCornerBracket' => 12303,
    );

    foreach ($characters as $name => $character) {
        $this->chr[$name] = is_int($character) ? $this->uchr($character) : $character;
    }

    if ($setDefaults) {
        $this->set_defaults();
    }

    return true;
}
|
|
|
|
/**
 * Resets every setting to its default by calling each setter without
 * arguments, in a fixed order.
 *
 * @return bool always TRUE
 */
function set_defaults() {
    $setters = array(
        // general attributes
        'set_tags_to_ignore',
        'set_classes_to_ignore',
        'set_ids_to_ignore',

        // smart characters
        'set_smart_quotes',
        // DEPRECATED: set_smart_quotes_language
        'set_smart_quotes_primary',   // added in version 1.15
        'set_smart_quotes_secondary', // added in version 1.15
        'set_smart_dashes',
        'set_smart_ellipses',
        'set_smart_diacritics',
        'set_diacritic_language',
        'set_diacritic_custom_replacements',
        'set_smart_marks',
        'set_smart_ordinal_suffix',
        'set_smart_math',
        'set_smart_fractions',
        'set_smart_exponents',
        // DEPRECATED: set_smart_multiplication

        // smart spacing
        'set_single_character_word_spacing',
        'set_fraction_spacing',
        'set_unit_spacing',
        'set_units',
        'set_dash_spacing',
        'set_dewidow',
        'set_max_dewidow_length',
        'set_max_dewidow_pull',
        'set_wrap_hard_hyphens',
        'set_url_wrap',
        'set_email_wrap',
        'set_min_after_url_wrap',
        'set_space_collapse',

        // character styling
        'set_style_ampersands',
        'set_style_caps',
        'set_style_initial_quotes',
        'set_style_numbers',
        'set_initial_quote_tags',

        // hyphenation
        'set_hyphenation',
        'set_hyphenation_language',
        'set_min_length_hyphenation',
        'set_min_before_hyphenation',
        'set_min_after_hyphenation',
        'set_hyphenate_headings',
        'set_hyphenate_all_caps',
        'set_hyphenate_title_case', // added in version 1.5
        'set_hyphenation_exceptions',
    );

    foreach ($setters as $setter) {
        $this->$setter();
    }

    return true;
}
|
|
|
|
// sets tags where typography of children will be untouched
|
|
/**
 * Sets the tags whose children are left untouched (the "ignoreTags" setting).
 *
 * Tag names are lower-cased, self closing tags are removed, and a fixed list
 * of tags that must never be processed is always appended when missing.
 *
 * @param array|string $tags tag names, as an array or as a string separated by whitespace and/or commas
 * @return bool always TRUE
 */
function set_tags_to_ignore($tags = array("code", "head", "kbd", "object", "option", "pre", "samp", "script", "select", "style", "textarea", "title", "var", "math")) {
    if (!is_array($tags)) {
        $tags = preg_split("/[\s,]+/", $tags, -1, PREG_SPLIT_NO_EMPTY);
    }

    // lower-case without iterating by reference, so no dangling reference is left behind
    $tags = array_map('strtolower', $tags);

    // self closing tags shouldn't be in $tags; array_diff() does not depend on the
    // keys being 0..n-1, so sparse or associative input is handled as well
    $selfClosingTags = array('area', 'base', 'basefont', 'br', 'frame', 'hr', 'img', 'input', 'link', 'meta');
    $tags = array_values(array_diff($tags, $selfClosingTags));

    // include all inappropriate tags in $tags
    $inappropriateTags = array('iframe', 'textarea', 'button', 'select', 'optgroup', 'option', 'map', 'style', 'head', 'title', 'script', 'applet', 'object', 'param');
    foreach ($inappropriateTags as $inappropriateTag) {
        if (!in_array($inappropriateTag, $tags, true)) {
            $tags[] = $inappropriateTag;
        }
    }

    $this->settings["ignoreTags"] = $tags;

    return true;
}
|
|
|
|
// sets classes where typography of children will be untouched
|
|
/**
 * Sets the classes whose children are left untouched (the "ignoreClasses" setting).
 *
 * @param array|string $classes class names, as an array or as a string separated by whitespace and/or commas
 * @return bool always TRUE
 */
function set_classes_to_ignore($classes = array("vcard", "noTypo")) {
    $this->settings['ignoreClasses'] = is_array($classes)
        ? $classes
        : preg_split("/[\s,]+/", $classes, -1, PREG_SPLIT_NO_EMPTY);

    return true;
}
|
|
|
|
// sets IDs where typography of children will be untouched
|
|
/**
 * Sets the IDs whose children are left untouched (the "ignoreIDs" setting).
 *
 * @param array|string $ids IDs, as an array or as a string separated by whitespace and/or commas
 * @return bool always TRUE
 */
function set_ids_to_ignore($ids = array()) {
    $this->settings['ignoreIDs'] = is_array($ids)
        ? $ids
        : preg_split("/[\s,]+/", $ids, -1, PREG_SPLIT_NO_EMPTY);

    return true;
}
|
|
|
|
// curl quotemarks
|
|
/**
 * Switches the curling of quotemarks on or off (the "smartQuotes" setting).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_smart_quotes($on = TRUE) {
    $this->settings['smartQuotes'] = $on;

    return true;
}
|
|
|
|
// DEPRECATED
|
|
// language preferences for curling quotemarks
|
|
// allowed values for $lang
|
|
// "en" = English style quotes, replaces "foo" with “foo”
|
|
// "de" = German style quotes, replaces "foo" with „foo”
|
|
// "fr" = French guillemets, replaces "foo" with «foo»
|
|
// "fr-reverse" = Reverse French guillemets, replaces "foo" with »foo«
|
|
/**
 * DEPRECATED. Chooses the opening/closing characters for double and single
 * quotes by language.
 *
 * @param string $lang "de", "fr" or "fr-reverse"; anything else selects the English curled quotes
 * @return bool always TRUE
 */
function set_smart_quotes_language($lang = "en") {
    switch ($lang) {
        case "de":
            $this->chr["doubleQuoteOpen"]  = $this->chr["doubleLow9Quote"];
            $this->chr["doubleQuoteClose"] = $this->uchr(8220);
            $this->chr["singleQuoteOpen"]  = $this->chr["singleLow9Quote"];
            $this->chr["singleQuoteClose"] = $this->uchr(8216);
            break;

        case "fr":
            $this->chr["doubleQuoteOpen"]  = $this->chr["guillemetOpen"];
            $this->chr["doubleQuoteClose"] = $this->chr["guillemetClose"];
            $this->chr["singleQuoteOpen"]  = $this->chr["singleAngleQuoteOpen"];
            $this->chr["singleQuoteClose"] = $this->chr["singleAngleQuoteClose"];
            break;

        case "fr-reverse":
            $this->chr["doubleQuoteOpen"]  = $this->chr["guillemetClose"];
            $this->chr["doubleQuoteClose"] = $this->chr["guillemetOpen"];
            $this->chr["singleQuoteOpen"]  = $this->chr["singleAngleQuoteClose"];
            $this->chr["singleQuoteClose"] = $this->chr["singleAngleQuoteOpen"];
            break;

        default:
            $this->chr["doubleQuoteOpen"]  = $this->uchr(8220);
            $this->chr["doubleQuoteClose"] = $this->uchr(8221);
            $this->chr["singleQuoteOpen"]  = $this->uchr(8216);
            $this->chr["singleQuoteClose"] = $this->uchr(8217);
            break;
    }

    return true;
}
|
|
|
|
|
|
// Primary quotemarks style
|
|
// allowed values for $style
|
|
// "doubleCurled" => "“foo”",
|
|
// "doubleCurledReversed" => "”foo”",
|
|
// "doubleLow9" => "„foo”",
|
|
// "doubleLow9Reversed" => "„foo“",
|
|
// "singleCurled" => "‘foo’",
|
|
// "singleCurledReversed" => "’foo’",
|
|
// "singleLow9" => "‚foo’",
|
|
// "singleLow9Reversed" => "‚foo‘",
|
|
// "doubleGuillemetsFrench" => "« foo »",
|
|
// "doubleGuillemets" => "«foo»",
|
|
// "doubleGuillemetsReversed" => "»foo«",
|
|
// "singleGuillemets" => "‹foo›",
|
|
// "singleGuillemetsReversed" => "›foo‹",
|
|
// "cornerBrackets" => "「foo」",
|
|
// "whiteCornerBracket" => "『foo』",
|
|
/**
 * Selects the primary quotemark style by setting the "doubleQuoteOpen" and
 * "doubleQuoteClose" characters.
 *
 * @param string $style one of the style names handled below; unknown names fall back to the "doubleCurled" characters
 * @return bool always TRUE
 */
function set_smart_quotes_primary($style = "doubleCurled") {
    // each case yields array(opening character, closing character);
    // the case order matches the order in which the styles are compared
    switch ($style) {
        case "doubleCurled":
            $pair = array($this->uchr(8220), $this->uchr(8221));
            break;
        case "doubleCurledReversed":
            $pair = array($this->uchr(8221), $this->uchr(8221));
            break;
        case "doubleLow9":
            $pair = array($this->chr["doubleLow9Quote"], $this->uchr(8221));
            break;
        case "doubleLow9Reversed":
            $pair = array($this->chr["doubleLow9Quote"], $this->uchr(8220));
            break;
        case "singleCurled":
            $pair = array($this->uchr(8216), $this->uchr(8217));
            break;
        case "singleCurledReversed":
            $pair = array($this->uchr(8217), $this->uchr(8217));
            break;
        case "singleLow9":
            $pair = array($this->chr["singleLow9Quote"], $this->uchr(8217));
            break;
        case "singleLow9Reversed":
            $pair = array($this->chr["singleLow9Quote"], $this->uchr(8216));
            break;
        case "doubleGuillemetsFrench":
            // French guillemets are padded on the inside with no-break spaces
            $pair = array(
                $this->chr["guillemetOpen"].$this->chr["noBreakSpace"],
                $this->chr["noBreakSpace"].$this->chr["guillemetClose"],
            );
            break;
        case "doubleGuillemets":
            $pair = array($this->chr["guillemetOpen"], $this->chr["guillemetClose"]);
            break;
        case "doubleGuillemetsReversed":
            $pair = array($this->chr["guillemetClose"], $this->chr["guillemetOpen"]);
            break;
        case "singleGuillemets":
            $pair = array($this->chr["singleAngleQuoteOpen"], $this->chr["singleAngleQuoteClose"]);
            break;
        case "singleGuillemetsReversed":
            $pair = array($this->chr["singleAngleQuoteClose"], $this->chr["singleAngleQuoteOpen"]);
            break;
        case "cornerBrackets":
            $pair = array($this->chr["leftCornerBracket"], $this->chr["rightCornerBracket"]);
            break;
        case "whiteCornerBracket":
            $pair = array($this->chr["leftWhiteCornerBracket"], $this->chr["rightWhiteCornerBracket"]);
            break;
        default:
            $pair = array($this->uchr(8220), $this->uchr(8221));
            break;
    }

    $this->chr["doubleQuoteOpen"]  = $pair[0];
    $this->chr["doubleQuoteClose"] = $pair[1];

    return true;
}
|
|
// Secondary quotemarks style
|
|
// allowed values for $style
|
|
// "doubleCurled" => "“foo”",
|
|
// "doubleCurledReversed" => "”foo”",
|
|
// "doubleLow9" => "„foo”",
|
|
// "doubleLow9Reversed" => "„foo“",
|
|
// "singleCurled" => "‘foo’",
|
|
// "singleCurledReversed" => "’foo’",
|
|
// "singleLow9" => "‚foo’",
|
|
// "singleLow9Reversed" => "‚foo‘",
|
|
// "doubleGuillemetsFrench" => "« foo »",
|
|
// "doubleGuillemets" => "«foo»",
|
|
// "doubleGuillemetsReversed" => "»foo«",
|
|
// "singleGuillemets" => "‹foo›",
|
|
// "singleGuillemetsReversed" => "›foo‹",
|
|
// "cornerBrackets" => "「foo」",
|
|
// "whiteCornerBracket" => "『foo』",
|
|
/**
 * Selects the secondary quotemark style by setting the "singleQuoteOpen" and
 * "singleQuoteClose" characters.
 *
 * @param string $style one of the style names handled below; unknown names fall back to the "singleCurled" characters
 * @return bool always TRUE
 */
function set_smart_quotes_secondary($style = "singleCurled") {
    // each case yields array(opening character, closing character);
    // the case order matches the order in which the styles are compared
    switch ($style) {
        case "doubleCurled":
            $pair = array($this->uchr(8220), $this->uchr(8221));
            break;
        case "doubleCurledReversed":
            $pair = array($this->uchr(8221), $this->uchr(8221));
            break;
        case "doubleLow9":
            $pair = array($this->chr["doubleLow9Quote"], $this->uchr(8221));
            break;
        case "doubleLow9Reversed":
            $pair = array($this->chr["doubleLow9Quote"], $this->uchr(8220));
            break;
        case "singleCurled":
            $pair = array($this->uchr(8216), $this->uchr(8217));
            break;
        case "singleCurledReversed":
            $pair = array($this->uchr(8217), $this->uchr(8217));
            break;
        case "singleLow9":
            $pair = array($this->chr["singleLow9Quote"], $this->uchr(8217));
            break;
        case "singleLow9Reversed":
            $pair = array($this->chr["singleLow9Quote"], $this->uchr(8216));
            break;
        case "doubleGuillemetsFrench":
            // French guillemets are padded on the inside with no-break spaces
            $pair = array(
                $this->chr["guillemetOpen"].$this->chr["noBreakSpace"],
                $this->chr["noBreakSpace"].$this->chr["guillemetClose"],
            );
            break;
        case "doubleGuillemets":
            $pair = array($this->chr["guillemetOpen"], $this->chr["guillemetClose"]);
            break;
        case "doubleGuillemetsReversed":
            $pair = array($this->chr["guillemetClose"], $this->chr["guillemetOpen"]);
            break;
        case "singleGuillemets":
            $pair = array($this->chr["singleAngleQuoteOpen"], $this->chr["singleAngleQuoteClose"]);
            break;
        case "singleGuillemetsReversed":
            $pair = array($this->chr["singleAngleQuoteClose"], $this->chr["singleAngleQuoteOpen"]);
            break;
        case "cornerBrackets":
            $pair = array($this->chr["leftCornerBracket"], $this->chr["rightCornerBracket"]);
            break;
        case "whiteCornerBracket":
            $pair = array($this->chr["leftWhiteCornerBracket"], $this->chr["rightWhiteCornerBracket"]);
            break;
        default:
            $pair = array($this->uchr(8216), $this->uchr(8217));
            break;
    }

    $this->chr["singleQuoteOpen"]  = $pair[0];
    $this->chr["singleQuoteClose"] = $pair[1];

    return true;
}
|
|
|
|
|
|
|
|
|
|
|
|
// replaces "a--a" with En Dash " -- " and "---" with Em Dash
|
|
/**
 * Switches smart dash replacement on or off (the "smartDashes" setting).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_smart_dashes($on = TRUE) {
    $this->settings['smartDashes'] = $on;

    return true;
}
|
|
|
|
// replaces "..." with "…"
|
|
/**
 * Switches smart ellipsis replacement on or off (the "smartEllipses" setting).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_smart_ellipses($on = TRUE) {
    $this->settings['smartEllipses'] = $on;

    return true;
}
|
|
|
|
// replaces "creme brulee" with "crème brûlée"
|
|
/**
 * Switches smart diacritic replacement on or off (the "smartDiacritics" setting).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_smart_diacritics($on = TRUE) {
    $this->settings['smartDiacritics'] = $on;

    return true;
}
|
|
|
|
// defines the diacritic language for text
|
|
/**
 * Selects the diacritic language and loads its word list into the
 * "diacriticWords" setting.
 *
 * The language file is looked up as diacritics/<lang>.php next to this file.
 * When $lang is not a plain language code or no such file exists, the en-US
 * file is loaded instead. The same absolute path is used for the existence
 * check and for the include, so include_path and the working directory cannot
 * redirect the include to another file.
 *
 * @param string $lang language code such as "en-US"
 * @return bool always TRUE
 */
function set_diacritic_language($lang = "en-US") {
    // nothing to do when the requested language is already the current one
    if (isset($this->settings["diacriticLanguage"]) && $this->settings["diacriticLanguage"] == $lang) {
        return true;
    }

    $this->settings["diacriticLanguage"] = $lang;

    $diacriticDir  = dirname(__FILE__).'/diacritics/';
    $diacriticFile = $diacriticDir.'en-US.php';

    // only letters, digits, "_" and "-" are accepted, so $lang can never point outside the directory
    if (is_scalar($lang) && preg_match('/^[A-Za-z0-9_\-]+$/', (string) $lang) && file_exists($diacriticDir.$lang.'.php')) {
        $diacriticFile = $diacriticDir.$lang.'.php';
    }

    // the language file is expected to define $diacriticWords in this scope
    include($diacriticFile);
    $this->settings["diacriticWords"] = $diacriticWords;

    return true;
}
|
|
|
|
// $customReplacements must be
|
|
// an array formatted array(needle=>replacement, needle=>replacement...), or
|
|
// a string formatted `"needle"=>"replacement","needle"=>"replacement",...`
|
|
/**
 * Stores custom diacritic replacements in the "diacriticCustomReplacements" setting.
 *
 * Accepted input:
 *  - a string formatted `"needle"=>"replacement","needle"=>"replacement",...`
 *    (single or double quotes), which is split on commas;
 *  - an array whose values are such `"needle"=>"replacement"` strings;
 *  - an associative array formatted array(needle => replacement).
 *
 * Every entry is parsed independently: an entry with a missing needle or
 * replacement is skipped and never inherits a value from a previous entry.
 * Needles and replacements are trimmed and stripped of tags.
 *
 * @param array|string $customReplacements see above
 * @return bool always TRUE
 */
function set_diacritic_custom_replacements($customReplacements = array()) {
    $replacements = array();

    if (!is_array($customReplacements)) {
        $customReplacements = preg_split("/,/", $customReplacements, -1, PREG_SPLIT_NO_EMPTY);
    }

    foreach ($customReplacements as $arrayKey => $customReplacement) {
        if (!is_scalar($customReplacement)) {
            continue; // nothing sensible can be parsed from a non-scalar entry
        }
        $customReplacement = (string) $customReplacement;

        // reset per entry so a malformed entry cannot reuse the previous key/value
        $key   = null;
        $value = null;

        // account for single and double quotes
        if (preg_match("/(?:\")([^\"]+)(?:\"\s*=>)/", $customReplacement, $match)) {
            $key = $match[1];
        } elseif (preg_match("/(?:')([^']+)(?:'\s*=>)/", $customReplacement, $match)) {
            $key = $match[1];
        }

        if (preg_match("/(?:=>\s*\")([^\"]+)(?:\")/", $customReplacement, $match)) {
            $value = $match[1];
        } elseif (preg_match("/(?:=>\s*')([^']+)(?:')/", $customReplacement, $match)) {
            $value = $match[1];
        }

        // array(needle => replacement) form: the entry is not a `"a"=>"b"` string
        // and the array key itself is the needle
        if (null === $key && null === $value && is_string($arrayKey)) {
            $key   = $arrayKey;
            $value = $customReplacement;
        }

        if (null !== $key && null !== $value) {
            $replacements[strip_tags(trim($key))] = strip_tags(trim($value));
        }
    }

    $this->settings["diacriticCustomReplacements"] = $replacements;

    return true;
}
|
|
|
|
|
|
// replaces (r) (c) (tm) (sm) (p) (R) (C) (TM) (SM) (P) with ® © ™ ℠ ℗
|
|
/**
 * Switches smart mark replacement on or off (the "smartMarks" setting).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_smart_marks($on = TRUE) {
    $this->settings['smartMarks'] = $on;

    return true;
}
|
|
|
|
// enables smart math replacements (stored as the "smartMath" setting)
|
|
/**
 * Switches smart math replacement on or off (the "smartMath" setting).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_smart_math($on = TRUE) {
    $this->settings['smartMath'] = $on;

    return true;
}
|
|
|
|
// enables smart exponent replacement (stored as the "smartExponents" setting)
|
|
/**
 * Switches smart exponent replacement on or off (the "smartExponents" setting).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_smart_exponents($on = TRUE) {
    $this->settings['smartExponents'] = $on;

    return true;
}
|
|
|
|
// replaces 1/4 with <sup>1</sup>⁄<sub>4</sub>
|
|
/**
 * Switches smart fraction replacement on or off (the "smartFractions" setting).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_smart_fractions($on = TRUE) {
    $this->settings['smartFractions'] = $on;

    return true;
}
|
|
|
|
// DEPRECATED
|
|
/**
 * DEPRECATED. Writes the same "smartMath" setting as set_smart_math().
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_smart_multiplication($on = TRUE) {
    $this->settings['smartMath'] = $on;

    return true;
}
|
|
|
|
// enables smart handling of ordinal suffixes (stored as the "smartOrdinalSuffix" setting)
|
|
/**
 * Switches smart ordinal suffix handling on or off (the "smartOrdinalSuffix" setting).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_smart_ordinal_suffix($on = TRUE) {
    $this->settings['smartOrdinalSuffix'] = $on;

    return true;
}
|
|
|
|
// single character words are forced to next line with insertion of &nbsp;
|
|
/**
 * Switches single character word spacing on or off
 * (the "singleCharacterWordSpacing" setting).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_single_character_word_spacing($on = TRUE) {
    $this->settings['singleCharacterWordSpacing'] = $on;

    return true;
}
|
|
|
|
// enables fraction spacing (stored as the "fractionSpacing" setting)
|
|
/**
 * Switches fraction spacing on or off (the "fractionSpacing" setting).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_fraction_spacing($on = TRUE) {
    $this->settings['fractionSpacing'] = $on;

    return true;
}
|
|
|
|
// units and values are kept together with insertion of &nbsp;
|
|
/**
 * Switches unit spacing on or off (the "unitSpacing" setting).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_unit_spacing($on = TRUE) {
    $this->settings['unitSpacing'] = $on;

    return true;
}
|
|
|
|
// a list of units to keep with their values
|
|
/**
 * Sets the list of units to keep with their values (the "units" setting).
 *
 * @param array|string $units units, as an array or as a string separated by whitespace and/or commas
 * @return bool always TRUE
 */
function set_units($units = array()) {
    $this->settings['units'] = is_array($units)
        ? $units
        : preg_split("/[\s,]+/", $units, -1, PREG_SPLIT_NO_EMPTY);

    return true;
}
|
|
|
|
// Em and En dashes are wrapped in thin spaces
|
|
/**
 * Switches dash spacing on or off (the "dashSpacing" setting).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_dash_spacing($on = TRUE) {
    $this->settings['dashSpacing'] = $on;

    return true;
}
|
|
|
|
// Remove extra space Characters
|
|
/**
 * Switches the removal of extra space characters on or off
 * (the "spaceCollapse" setting).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_space_collapse($on = TRUE) {
    $this->settings['spaceCollapse'] = $on;

    return true;
}
|
|
|
|
// enables widow handling
|
|
/**
 * Switches widow handling on or off (the "dewidow" setting).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_dewidow($on = TRUE) {
    $this->settings['dewidow'] = $on;

    return true;
}
|
|
|
|
// establishes maximum length of a widows that will be protected
|
|
/**
 * Sets the maximum length of a widow that will be protected
 * (the "dewidowMaxLength" setting).
 *
 * @param int $len maximum length; values that are not greater than 1 are replaced by 5
 * @return bool always TRUE
 */
function set_max_dewidow_length($len = 5) {
    if (!($len > 1)) {
        $len = 5;
    }
    $this->settings['dewidowMaxLength'] = $len;

    return true;
}
|
|
|
|
// establishes maximum length of pulled text to keep widows company
|
|
/**
 * Sets the maximum length of text pulled down to keep a widow company
 * (the "dewidowMaxPull" setting).
 *
 * @param int $len maximum length; values that are not greater than 1 are replaced by 5
 * @return bool always TRUE
 */
function set_max_dewidow_pull($len = 5) {
    if (!($len > 1)) {
        $len = 5;
    }
    $this->settings['dewidowMaxPull'] = $len;

    return true;
}
|
|
|
|
// enables wrapping at hard hyphens internal to a word with the insertion of a zero-width-space
|
|
/**
 * Switches wrapping at hard hyphens inside words on or off
 * (the "hyphenHardWrap" setting).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_wrap_hard_hyphens($on = TRUE) {
    $this->settings['hyphenHardWrap'] = $on;

    return true;
}
|
|
|
|
// enables wrapping of urls
|
|
/**
 * Switches wrapping of URLs on or off (the "urlWrap" setting).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_url_wrap($on = TRUE) {
    $this->settings['urlWrap'] = $on;

    return true;
}
|
|
|
|
// enables wrapping of email addresses
|
|
/**
 * Switches wrapping of email addresses on or off (the "emailWrap" setting).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_email_wrap($on = TRUE) {
    $this->settings['emailWrap'] = $on;

    return true;
}
|
|
|
|
// establishes minimum character requirement after a url wrapping point
|
|
/**
 * Sets the minimum number of characters required after a URL wrapping point
 * (the "urlMinAfterWrap" setting).
 *
 * @param int $len minimum length; values that are not greater than 0 are replaced by 5
 * @return bool always TRUE
 */
function set_min_after_url_wrap($len = 5) {
    if (!($len > 0)) {
        $len = 5;
    }
    $this->settings['urlMinAfterWrap'] = $len;

    return true;
}
|
|
|
|
// wrap ampersands in <span class="amp">
|
|
/**
 * Switches the styling of ampersands on or off (the "styleAmpersands" setting).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_style_ampersands($on = TRUE) {
    $this->settings['styleAmpersands'] = $on;

    return true;
}
|
|
|
|
// wrap caps in <span class="caps">
|
|
/**
 * Switches the styling of caps on or off (the "styleCaps" setting).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_style_caps($on = TRUE) {
    $this->settings['styleCaps'] = $on;

    return true;
}
|
|
|
|
// wrap initial quotes in <span class="quo"> or <span class="dquo">
|
|
/**
 * Switches the styling of initial quotes on or off
 * (the "styleInitialQuotes" setting).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_style_initial_quotes($on = TRUE) {
    $this->settings['styleInitialQuotes'] = $on;

    return true;
}
|
|
|
|
// wrap numbers in <span class="numbers">
|
|
/**
 * Switches the styling of numbers on or off (the "styleNumbers" setting).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_style_numbers($on = TRUE) {
    $this->settings['styleNumbers'] = $on;

    return true;
}
|
|
|
|
// sets tags where initial quotes and guillemets should be styled
|
|
/**
 * Sets the tags in which initial quotes and guillemets are styled
 * (the "initialQuoteTags" setting). Tag names are stored lower-cased.
 *
 * @param array|string $tags tag names, as an array or as a string separated by any non-alphanumeric characters
 * @return bool always TRUE
 */
function set_initial_quote_tags($tags = array("p", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "li", "dd", "dt")) {
    if (!is_array($tags)) {
        // the split must be case-insensitive: lower-casing happens afterwards, and
        // without the "i" flag upper-case letters act as separators ("H1" => "1")
        $tags = preg_split("/[^a-z0-9]+/i", $tags, -1, PREG_SPLIT_NO_EMPTY);
    }

    // array_map() keeps the keys and leaves no dangling reference behind
    $this->settings["initialQuoteTags"] = array_map('strtolower', $tags);

    return true;
}
|
|
|
|
// enables hyphenation of text
|
|
/**
 * Switches hyphenation of text on or off (the "hyphenation" setting).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_hyphenation($on = TRUE) {
    $this->settings['hyphenation'] = $on;

    return true;
}
|
|
|
|
// defines hyphenation language for text
|
|
/**
 * Selects the hyphenation language and loads its patterns into the
 * "hyphenationPattern", "hyphenationPatternMaxSegment" and
 * "hyphenationPatternExceptions" settings.
 *
 * The language file is looked up as lang/<lang>.php next to this file. When
 * $lang is not a plain language code or no such file exists, the en-US file
 * is loaded instead. The same absolute path is used for the existence check
 * and for the include, so include_path and the working directory cannot
 * redirect the include to another file.
 *
 * @param string $lang language code such as "en-US"
 * @return bool always TRUE
 */
function set_hyphenation_language($lang = "en-US") {
    // nothing to do when the requested language is already the current one
    if (isset($this->settings["hyphenLanguage"]) && $this->settings["hyphenLanguage"] == $lang) {
        return true;
    }

    $this->settings["hyphenLanguage"] = $lang;

    $patternDir  = dirname(__FILE__).'/lang/';
    $patternFile = $patternDir.'en-US.php';

    // only letters, digits, "_" and "-" are accepted, so $lang can never point outside the directory
    if (is_scalar($lang) && preg_match('/^[A-Za-z0-9_\-]+$/', (string) $lang) && file_exists($patternDir.$lang.'.php')) {
        $patternFile = $patternDir.$lang.'.php';
    }

    // the language file is expected to define $patgen, $patgenMaxSeg and $patgenExceptions in this scope
    include($patternFile);
    $this->settings["hyphenationPattern"] = $patgen;
    $this->settings["hyphenationPatternMaxSegment"] = $patgenMaxSeg;
    $this->settings["hyphenationPatternExceptions"] = $patgenExceptions;

    // make sure hyphenationExceptions is not set to force remerging of patgen and custom exceptions
    unset($this->settings["hyphenationExceptions"]);

    return true;
}
|
|
|
|
// establishes minimum length of a word that may be hyphenated
|
|
/**
 * Sets the minimum length of a word that may be hyphenated
 * (the "hyphenMinLength" setting).
 *
 * @param int $len minimum length; values that are not greater than 1 are replaced by 5
 * @return bool always TRUE
 */
function set_min_length_hyphenation($len = 5) {
    if (!($len > 1)) {
        $len = 5;
    }
    $this->settings['hyphenMinLength'] = $len;

    return true;
}
|
|
|
|
// establishes minimum character requirement before a hyphenation point
|
|
/**
 * Sets the minimum number of characters required before a hyphenation point
 * (the "hyphenMinBefore" setting).
 *
 * @param int $len minimum length; values that are not greater than 0 are replaced by 3
 * @return bool always TRUE
 */
function set_min_before_hyphenation($len = 3) {
    if (!($len > 0)) {
        $len = 3;
    }
    $this->settings['hyphenMinBefore'] = $len;

    return true;
}
|
|
|
|
// establishes minimum character requirement after a hyphenation point
|
|
/**
 * Sets the minimum number of characters required after a hyphenation point
 * (the "hyphenMinAfter" setting).
 *
 * @param int $len minimum length; values that are not greater than 0 are replaced by 2
 * @return bool always TRUE
 */
function set_min_after_hyphenation($len = 2) {
    if (!($len > 0)) {
        $len = 2;
    }
    $this->settings['hyphenMinAfter'] = $len;

    return true;
}
|
|
|
|
// allows/disallows hyphenation of title/heading text
|
|
/**
 * Allows or disallows hyphenation of title/heading text
 * (the "hyphenateTitle" setting).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_hyphenate_headings($on = TRUE) {
    $this->settings['hyphenateTitle'] = $on;

    return true;
}
|
|
|
|
// allows hyphenation of strings of all capital characters
|
|
/**
 * Allows or disallows hyphenation of strings of all capital characters
 * (the "hyphenateAllCaps" setting).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_hyphenate_all_caps($on = TRUE) {
    $this->settings['hyphenateAllCaps'] = $on;

    return true;
}
|
|
|
|
// allows hyphenation of title case words (stored as the "hyphenateTitleCase" setting)
|
|
// added in version 1.5
|
|
/**
 * Stores the "hyphenateTitleCase" setting (added in version 1.5).
 *
 * @param bool $on value stored in the setting
 * @return bool always TRUE
 */
function set_hyphenate_title_case($on = TRUE) {
    $this->settings['hyphenateTitleCase'] = $on;

    return true;
}
|
|
|
|
// defines custom word hyphenations
|
|
// expected input is an array of words with all hyphenation points marked with a hard hyphen
|
|
/**
 * Defines custom word hyphenations (the "hyphenationCustomExceptions" setting).
 *
 * Each exception is a word with all hyphenation points marked by a hard
 * hyphen. Exceptions are lower-cased and stored under the word with its
 * hyphens removed, e.g. array("hyphenation" => "hy-phen-a-tion").
 *
 * ASCII words are handled without the mbstring extension. Words with
 * non-ASCII bytes need mbstring and must be detected as UTF-8; otherwise the
 * method returns FALSE and leaves the settings untouched.
 *
 * @param array|string $exceptions words, as an array or as a string separated by anything other than letters, digits and hyphens
 * @return bool TRUE on success, FALSE when a word cannot be processed
 */
function set_hyphenation_exceptions($exceptions = array()) {
    if (!is_array($exceptions)) {
        $exceptions = preg_split("/[^a-zA-Z0-9\-]+/", $exceptions, -1, PREG_SPLIT_NO_EMPTY);
    }

    $encodings    = array("ASCII", "UTF-8", "ISO-8859-1");
    // checked up front so a missing extension yields FALSE instead of a fatal error
    $hasMultibyte = function_exists('mb_detect_encoding') && function_exists('mb_strtolower');

    $customExceptions = array();
    foreach ($exceptions as $exception) {
        if (!preg_match('/[^\x00-\x7F]/', $exception)) {
            // pure ASCII: plain string functions are sufficient (and faster)
            $exception = strtolower($exception);
        } elseif ($hasMultibyte && "UTF-8" == mb_detect_encoding($exception."a", $encodings)) {
            $exception = mb_strtolower($exception, "UTF-8");
        } else {
            // unsupported encoding, or multibyte text without mbstring
            return false;
        }

        // key: the word without hyphens; value: the word with its hyphenation points
        $customExceptions[str_replace("-", "", $exception)] = $exception;
    }

    $this->settings["hyphenationCustomExceptions"] = $customExceptions;

    // make sure hyphenationExceptions is not set to force remerging of patgen and custom exceptions
    unset($this->settings["hyphenationExceptions"]);

    return true;
}
|
|
|
|
|
|
|
|
#=======================================================================
|
|
#=======================================================================
|
|
#== METHODS - ACTIONS, let's do something!
|
|
#=======================================================================
|
|
#=======================================================================
|
|
|
|
|
|
# Returns: ARRAY of supported hyphenation languages in the form array( language code => language name)
|
|
/**
 * Lists the supported hyphenation languages.
 *
 * Scans the lang/ directory next to this file for *.php files and reads the
 * language name from each file's `$patgenLanguage = "...";` assignment.
 * Files without such an assignment are skipped.
 *
 * @return array language code => language name, sorted by name; empty when the directory cannot be read
 */
function get_languages() {
    $results = array();
    $langDir = dirname(__FILE__)."/lang/";

    $handler = is_dir($langDir) ? opendir($langDir) : false;
    if (false === $handler) {
        return $results;
    }

    // compare against false explicitly: a file named "0" must not end the loop
    while (false !== ($file = readdir($handler))) {
        // we only want the php files
        if (substr($file, -4) != ".php") {
            continue;
        }

        $fileContent = file_get_contents($langDir.$file);
        if (false === $fileContent
            || !preg_match('/\$patgenLanguage\s*=\s*((".+")|(\'.+\'))\s*;/', $fileContent, $matches)) {
            continue;
        }

        // strip the surrounding quotes from the name and the ".php" from the file name
        $results[substr($file, 0, -4)] = substr($matches[1], 1, -1);
    }
    closedir($handler);

    asort($results);

    return $results;
}
|
|
|
|
# Returns: ARRAY of supported diacritic languages in the form array( language code => language name)
|
|
/**
 * Lists the supported diacritic languages.
 *
 * Scans the diacritics/ directory next to this file for *.php files and reads
 * the language name from each file's `$diacriticLanguage = "...";` assignment.
 * Files without such an assignment are skipped.
 *
 * @return array language code => language name, sorted by name; empty when the directory cannot be read
 */
function get_diacritic_languages() {
    $results = array();
    $langDir = dirname(__FILE__)."/diacritics/";

    $handler = is_dir($langDir) ? opendir($langDir) : false;
    if (false === $handler) {
        return $results;
    }

    // compare against false explicitly: a file named "0" must not end the loop
    while (false !== ($file = readdir($handler))) {
        // we only want the php files
        if (substr($file, -4) != ".php") {
            continue;
        }

        $fileContent = file_get_contents($langDir.$file);
        if (false === $fileContent
            || !preg_match('/\$diacriticLanguage\s*=\s*((".+")|(\'.+\'))\s*;/', $fileContent, $matches)) {
            continue;
        }

        // strip the surrounding quotes from the name and the ".php" from the file name
        $results[substr($file, 0, -4)] = substr($matches[1], 1, -1);
    }
    closedir($handler);

    asort($results);

    return $results;
}
|
|
|
|
# Action: modifies $html according to the defined settings
|
|
# Returns: processed $html
|
|
/**
 * Runs the complete typography pipeline over an HTML fragment.
 *
 * The fragment is tokenised with parseHTML. Text inside tags selected by the
 * "ignoreTags", "ignoreClasses" and "ignoreIDs" settings is locked; every other
 * text token is passed, in a fixed order, through the smart_*, spacing,
 * hyphenation, wrapping and style_* methods of this class.
 *
 * @param string $html    HTML fragment to process
 * @param bool   $isTitle TRUE when $html is a title: it is returned untouched if
 *                        "h1" or "h2" is listed in "ignoreTags", and the flag is
 *                        forwarded to hyphenate() and style_initial_quotes()
 * @return mixed the processed HTML, i.e. whatever parseHTML::unload() returns
 */
function process($html, $isTitle = FALSE) {

	if( isset($this->settings["ignoreTags"]) && $isTitle && ( in_array('h1', $this->settings["ignoreTags"]) || in_array('h2', $this->settings["ignoreTags"]) ) )
		return $html;

	require_once("php-parser/php-parser.php");

	// parse the html
	$this->parsedHTML = new parseHTML();
	$this->parsedHTML->load($html);
	$this->parsedHTML->unlock_text();

	// collect every tag whose text must be left alone; each setting is optional
	$tagsToIgnore = array();
	if(isset($this->settings["ignoreTags"]))
		$tagsToIgnore = $this->parsedHTML->get_tags_by_name($this->settings["ignoreTags"]);
	if(isset($this->settings["ignoreClasses"]))
		$tagsToIgnore += $this->parsedHTML->get_tags_by_class($this->settings["ignoreClasses"]); //union to avoid dup keys
	if(isset($this->settings["ignoreIDs"]))
		$tagsToIgnore += $this->parsedHTML->get_tag_by_id($this->settings["ignoreIDs"]); //union to avoid dup keys
	$this->parsedHTML->lock_children($tagsToIgnore);
	$unlockedTexts = $this->parsedHTML->get_unlocked_text();

	foreach($unlockedTexts as &$unlockedText) {

		// we won't be doing anything with spaces, so we can jump ship if that is all we have
		if (0 == strlen(trim($unlockedText["value"]))) continue;

		// decode all characters except < > &
		$unlockedText["value"] = html_entity_decode($unlockedText["value"], ENT_QUOTES, "UTF-8"); //converts all HTML entities to their applicable characters
		$unlockedText["value"] = htmlspecialchars($unlockedText["value"], ENT_NOQUOTES, "UTF-8"); //returns < > & to encoded HTML characters (&lt; &gt; and &amp; respectively)

		// modify anything that requires adjacent text awareness here
		$unlockedText = $this->smart_math($unlockedText);
		$unlockedText = $this->smart_diacritics($unlockedText);
		$unlockedText = $this->smart_quotes($unlockedText);
		$unlockedText = $this->smart_dashes($unlockedText);
		$unlockedText = $this->smart_ellipses($unlockedText);
		$unlockedText = $this->smart_marks($unlockedText);

		//keep spacing after smart character replacement
		$unlockedText = $this->single_character_word_spacing($unlockedText);
		$unlockedText = $this->dash_spacing($unlockedText);
		$unlockedText = $this->unit_spacing($unlockedText);

		//break it down for a bit more granularity
		$this->parsedText = new parseText();
		$this->parsedText->load($unlockedText);
		$parsedMixedWords = $this->parsedText->get_words(-1,0); // prohibit letter only words, allow caps
		$caps = (isset($this->settings["hyphenateAllCaps"]) && $this->settings["hyphenateAllCaps"]) ? 0 : -1 ;
		$parsedWords = $this->parsedText->get_words(1,$caps); // require letter only words, caps allowance in settings; mutually exclusive with $parsedMixedWords
		$parsedOther = $this->parsedText->get_other();

		// process individual text parts here
		$parsedMixedWords = $this->wrap_hard_hyphens($parsedMixedWords);
		$parsedWords = $this->hyphenate($parsedWords, $isTitle);
		$parsedOther = $this->wrap_urls($parsedOther);
		$parsedOther = $this->wrap_emails($parsedOther);

		//apply updates to unlockedText
		$this->parsedText->update($parsedMixedWords+$parsedWords+$parsedOther);
		$unlockedText = $this->parsedText->unload();

		//some final space manipulation
		$unlockedText = $this->dewidow($unlockedText);
		$unlockedText = $this->space_collapse($unlockedText);

		//everything that requires HTML injection occurs here (functions above assume tag-free content)
		//pay careful attention to functions below for tolerance of injected tags
		$unlockedText = $this->smart_ordinal_suffix($unlockedText); // call before "style_numbers" and "smart_fractions"
		$unlockedText = $this->smart_exponents($unlockedText); // call before "style_numbers"
		$unlockedText = $this->smart_fractions($unlockedText); // call before "style_numbers" and after "smart_ordinal_suffix"
		if(!$this->parsedHTML->in_class('caps', $unlockedText))
			$unlockedText = $this->style_caps($unlockedText); // call before "style_numbers"
		if(!$this->parsedHTML->in_class('numbers', $unlockedText))
			$unlockedText = $this->style_numbers($unlockedText); // call after "smart_ordinal_suffix", "smart_exponents", "smart_fractions", and "style_caps"
		if(!$this->parsedHTML->in_class('amp', $unlockedText))
			$unlockedText = $this->style_ampersands($unlockedText);
		if(!$this->parsedHTML->in_class(array('quo','dquo'), $unlockedText))
			$unlockedText = $this->style_initial_quotes($unlockedText, $isTitle);
	}
	// break the reference so the last array element is no longer aliased by $unlockedText
	unset($unlockedText);

	$this->parsedHTML->update($unlockedTexts);
	return $this->parsedHTML->unload();
}
|
|
|
|
|
|
# Action: modifies $html according to the defined settings as only appropriate for RSS feeds
|
|
# (i.e. excluding processes that may not display well with limited character set intelligence)
|
|
# Returns: processed $html
|
|
/**
 * Reduced variant of process() intended for feed output.
 *
 * Performs the same parsing and tag locking as process(), but only applies
 * smart_quotes(), smart_dashes(), smart_ellipses() and smart_marks() to each
 * unlocked text token; no spacing, hyphenation or HTML-injecting steps are run.
 *
 * @param string $html    HTML fragment to process
 * @param bool   $isTitle TRUE when $html is a title: it is returned untouched if
 *                        "h1" or "h2" is listed in "ignoreTags"
 * @return mixed the processed HTML, i.e. whatever parseHTML::unload() returns
 */
function process_feed($html, $isTitle = FALSE) {

	if( isset($this->settings["ignoreTags"]) && $isTitle && ( in_array('h1', $this->settings["ignoreTags"]) || in_array('h2', $this->settings["ignoreTags"]) ) )
		return $html;

	require_once("php-parser/php-parser.php");

	// parse the html
	$this->parsedHTML = new parseHTML();
	$this->parsedHTML->load($html);
	$this->parsedHTML->unlock_text();

	// collect every tag whose text must be left alone; each setting is optional
	$tagsToIgnore = array();
	if(isset($this->settings["ignoreTags"]))
		$tagsToIgnore = $this->parsedHTML->get_tags_by_name($this->settings["ignoreTags"]);
	if(isset($this->settings["ignoreClasses"]))
		$tagsToIgnore += $this->parsedHTML->get_tags_by_class($this->settings["ignoreClasses"]); //union to avoid dup keys
	if(isset($this->settings["ignoreIDs"]))
		$tagsToIgnore += $this->parsedHTML->get_tag_by_id($this->settings["ignoreIDs"]); //union to avoid dup keys
	$this->parsedHTML->lock_children($tagsToIgnore);
	$unlockedTexts = $this->parsedHTML->get_unlocked_text();

	foreach($unlockedTexts as &$unlockedText) {

		// we won't be doing anything with spaces, so we can jump ship if that is all we have
		if (0 == strlen(trim($unlockedText["value"]))) continue;

		// decode all characters except < > &
		$unlockedText["value"] = html_entity_decode($unlockedText["value"], ENT_QUOTES, "UTF-8"); //converts all HTML entities to their applicable characters
		$unlockedText["value"] = htmlspecialchars($unlockedText["value"], ENT_NOQUOTES, "UTF-8"); //returns < > & to encoded HTML characters (&lt; &gt; and &amp; respectively)

		// modify anything that requires adjacent text awareness here
		$unlockedText = $this->smart_quotes($unlockedText);
		$unlockedText = $this->smart_dashes($unlockedText);
		$unlockedText = $this->smart_ellipses($unlockedText);
		$unlockedText = $this->smart_marks($unlockedText);
	}
	// break the reference so the last array element is no longer aliased by $unlockedText
	unset($unlockedText);

	$this->parsedHTML->update($unlockedTexts);
	return $this->parsedHTML->unload();
}
|
|
|
|
|
|
|
|
|
|
#=======================================================================
|
|
#=======================================================================
|
|
#== OTHER METHODS
|
|
#=======================================================================
|
|
#=======================================================================
|
|
|
|
//expecting parsedHTML token of type text
|
|
/**
 * Replaces straight quotes, primes, backticks, comma quotes, guillemet digraphs
 * and apostrophes in a text token with the characters configured in $this->chr.
 *
 * Does nothing unless the "smartQuotes" setting is truthy. When the token carries
 * "prevChr" / "nextChr" entries, they are temporarily glued to the value so the
 * rules can see the adjacent character, and stripped again before returning.
 *
 * @param array $parsedHTMLtoken parsedHTML token of type text; its "value" entry is rewritten
 * @return array the token with the updated "value"
 */
function smart_quotes($parsedHTMLtoken) {
	if(!isset($this->settings["smartQuotes"]) || !$this->settings["smartQuotes"]) return $parsedHTMLtoken;

	// Body of a regex character class: \w plus the accented Latin letters
	// U+00C0..U+017F, skipping U+00D7 and U+00F7 as the original list did.
	// It must stay free of brackets and "|" because it is spliced into [...] below.
	$wordCharacters = '\w\x{00c0}-\x{00d6}\x{00d8}-\x{00f6}\x{00f8}-\x{017f}';

	$apostrophe = $this->chr["apostrophe"];
	$singleOpen = $this->chr["singleQuoteOpen"];
	$singleClose = $this->chr["singleQuoteClose"];
	$doubleOpen = $this->chr["doubleQuoteOpen"];
	$doubleClose = $this->chr["doubleQuoteClose"];

	//need to get context of adjacent characters outside adjacent inline tags or HTML comment
	//if we have adjacent characters add them to the text
	$nextChr = "";
	$prevChr = "";
	if(isset($parsedHTMLtoken["prevChr"]) && $parsedHTMLtoken["prevChr"] != "") {
		$prevChr = $parsedHTMLtoken["prevChr"];
		$parsedHTMLtoken["value"] = $prevChr.$parsedHTMLtoken["value"];
	}
	if(isset($parsedHTMLtoken["nextChr"]) && $parsedHTMLtoken["nextChr"] != "") {
		$nextChr = $parsedHTMLtoken["nextChr"];
		$parsedHTMLtoken["value"] = $parsedHTMLtoken["value"].$nextChr;
	}
	$value = $parsedHTMLtoken["value"];

	// before primes, handle quoted numbers
	$value = preg_replace("/(?<=\W|\A)'(\d+)'(?=\W|\Z)/u", $singleOpen.'$1'.$singleClose, $value);
	$value = preg_replace("/(?<=\W|\A)\"(\d+)\"(?=\W|\Z)/u", $doubleOpen.'$1'.$doubleClose, $value);

	// guillemets: both the raw digraph and its entity-encoded form
	// (process() re-encodes < and > before this method runs)
	$value = str_replace("<<", $this->chr["guillemetOpen"], $value);
	$value = str_replace("&lt;&lt;", $this->chr["guillemetOpen"], $value);
	$value = str_replace(">>", $this->chr["guillemetClose"], $value);
	$value = str_replace("&gt;&gt;", $this->chr["guillemetClose"], $value);

	// primes
	$value = preg_replace("/(\b\d+)''(?=\W|\Z)/u", '$1'.$this->chr["doublePrime"], $value);
	$value = preg_replace("/(\b\d+)\"(?=\W|\Z)/u", '$1'.$this->chr["doublePrime"], $value);
	$value = preg_replace("/(\b\d+)'(?=\W|\Z)/u", '$1'.$this->chr["singlePrime"], $value);

	// backticks
	$value = str_replace("``", $doubleOpen, $value);
	$value = str_replace("`", $singleOpen, $value);
	$value = str_replace("''", $doubleClose, $value);

	// comma quotes
	$value = str_replace(",,", $this->chr["doubleLow9Quote"], $value);
	$value = preg_replace("/(?<=\s|\A),(?=\S)/", $this->chr["singleLow9Quote"], $value); //like _,¿hola?'_

	// apostrophes
	$value = preg_replace("/(?<=[{$wordCharacters}])'(?=[{$wordCharacters}])/u", $apostrophe, $value);
	$value = preg_replace("/'(\d\d\b)/", $apostrophe.'$1', $value); // decades: '98
	// words that start with an elision; the inner apostrophe of 'tain't was already converted above
	$exceptions = array("'tain".$apostrophe."t", "'twere", "'twas", "'tis", "'til", "'bout", "'nuff", "'round", "'cause", "'splainin");
	$replacements = array($apostrophe."tain".$apostrophe."t", $apostrophe."twere", $apostrophe."twas", $apostrophe."tis", $apostrophe."til", $apostrophe."bout", $apostrophe."nuff", $apostrophe."round", $apostrophe."cause", $apostrophe."splainin");
	$value = str_replace($exceptions, $replacements, $value);

	//quotes
	$quoteRules = array("['", "{'", "('", "']", "'}", "')", "[\"", "{\"", "(\"", "\"]", "\"}", "\")", "\"'", "'\"");
	$quoteRulesReplace = array(
		"[".$singleOpen, "{".$singleOpen, "(".$singleOpen,
		$singleClose."]", $singleClose."}", $singleClose.")",
		"[".$doubleOpen, "{".$doubleOpen, "(".$doubleOpen,
		$doubleClose."]", $doubleClose."}", $doubleClose.")",
		$doubleOpen.$singleOpen, $singleClose.$doubleClose
	);
	$value = str_replace($quoteRules, $quoteRulesReplace, $value);
	$value = preg_replace("/'(?=[{$wordCharacters}])/u", $singleOpen, $value);
	$value = preg_replace("/(?<=[{$wordCharacters}])'/u", $singleClose, $value);
	$value = preg_replace("/(?<=\s|\A)'(?=\S)/", $singleOpen, $value); //like _'¿hola?'_
	$value = preg_replace("/(?<=\S)'(?=\s|\Z)/", $singleClose, $value);
	$value = preg_replace("/\"(?=[{$wordCharacters}])/u", $doubleOpen, $value);
	$value = preg_replace("/(?<=[{$wordCharacters}])\"/u", $doubleClose, $value);
	$value = preg_replace("/(?<=\s|\A)\"(?=\S)/", $doubleOpen, $value);
	$value = preg_replace("/(?<=\S)\"(?=\s|\Z)/", $doubleClose, $value);

	//quote catch-alls - assume left over quotes are closing - as this is often the most complicated position, thus most likely to be missed
	$value = str_replace("'", $singleClose, $value);
	$value = str_replace('"', $doubleClose, $value);

	//if we have adjacent characters remove them from the text
	$encodings = array("ASCII","UTF-8");
	$e = mb_detect_encoding($value."a", $encodings);// ."a" is a hack; see http://www.php.net/manual/en/function.mb-detect-encoding.php#81936
	if(!isset($e) || $e == "") $e = "ASCII";
	if($prevChr != "") {
		$value = mb_substr($value, 1, mb_strlen($value, $e), $e);
	}
	if($nextChr != "") {
		$value = mb_substr($value, 0, mb_strlen($value, $e)-1, $e);
	}

	$parsedHTMLtoken["value"] = $value;
	return $parsedHTMLtoken;
}
|
|
|
|
//expecting parsedHTML token of type text
|
|
/**
 * Replaces hyphen sequences in a text token with em dashes, en dashes and
 * no-break hyphens, then restores plain hyphens inside date-like strings.
 *
 * Does nothing unless the "smartDashes" setting is truthy.
 *
 * @param array $parsedHTMLtoken parsedHTML token of type text; its "value" entry is rewritten
 * @return array the token with the updated "value"
 */
function smart_dashes($parsedHTMLtoken) {
	if(!isset($this->settings["smartDashes"]) || !$this->settings["smartDashes"]) return $parsedHTMLtoken;

	// Body of a regex character class: \w plus the accented Latin letters
	// U+00C0..U+017F, skipping U+00D7 and U+00F7 as the original list did.
	// It must stay free of brackets and "|" because it is spliced into [...] below.
	$wordCharacters = '\w\x{00c0}-\x{00d6}\x{00d8}-\x{00f6}\x{00f8}-\x{017f}';

	$emDash = $this->chr["emDash"];
	$enDash = $this->chr["enDash"];
	$value = $parsedHTMLtoken["value"];

	// order matters: longest hyphen runs first
	$value = str_replace("---", $emDash, $value);
	$value = str_replace(" -- ", " ".$emDash." ", $value);
	$value = str_replace("--", $enDash, $value);
	$value = str_replace(" - ", " ".$emDash." ", $value);

	$value = preg_replace("/(\A|\s)\-([{$wordCharacters}])/u", '$1'.$enDash.'$2', $value);
	$value = preg_replace("/([{$wordCharacters}])\-(\Z|\s)/u", '$1'.$enDash.'$2', $value);
	$value = preg_replace("/(\b\d+)\-(\d+\b)/", '$1'.$enDash.'$2', $value);
	$value = preg_replace("/(\b\d{3})".$enDash."(\d{4}\b)/", '$1'.$this->chr["noBreakHyphen"].'$2', $value); // phone numbers
	$value = str_replace("xn".$enDash, "xn--", $value); // undo the en dash in an "xn--" prefix

	// revert dates back to original formats

	// fragments shared by the three date patterns below
	$separator = "[\-".$enDash."]";
	$before = "(?<=\s|\A|".$this->chr["noBreakSpace"].")";
	$after = "(?=\s|\Z|\)|\]|\.|\,|\?|\;|\:|\'|\"|\!|".$this->chr["noBreakSpace"].")";

	// YYYY-MM-DD
	$pattern = "/
		(
			{$before}
			[12][0-9]{3}
		)
		{$separator}
		(
			(?:[0][1-9]|[1][0-2])
		)
		{$separator}
		(
			(?:[0][1-9]|[12][0-9]|[3][0-1])
			{$after}
		)
		/xu";
	$value = preg_replace($pattern, "$1-$2-$3", $value);

	// MM-DD-YYYY or DD-MM-YYYY
	$pattern = "/
		(?:
			(?:
				(
					{$before}
					(?:[0]?[1-9]|[1][0-2])
				)
				{$separator}
				(
					(?:[0]?[1-9]|[12][0-9]|[3][0-1])
				)
			)
			|
			(?:
				(
					{$before}
					(?:[0]?[1-9]|[12][0-9]|[3][0-1])
				)
				{$separator}
				(
					(?:[0]?[1-9]|[1][0-2])
				)
			)
		)
		{$separator}
		(
			[12][0-9]{3}
			{$after}
		)
		/xu";
	// only one of the two alternatives captures, so $1$3 and $2$4 each yield a single value
	$value = preg_replace($pattern, "$1$3-$2$4-$5", $value);

	// YYYY-MM or YYYY-DDD next
	$pattern = "/
		(
			{$before}
			[12][0-9]{3}
		)
		{$separator}
		(
			(?:
				(?:[0][1-9]|[1][0-2])
				|
				(?:[0][0-9][1-9]|[1-2][0-9]{2}|[3][0-5][0-9]|[3][6][0-6])
			)
			{$after}
		)
		/xu";
	$value = preg_replace($pattern, "$1-$2", $value);

	$parsedHTMLtoken["value"] = $value;
	return $parsedHTMLtoken;
}
|
|
|
|
//expecting parsedHTML token of type text
|
|
/**
 * Swaps runs of three or four periods (tight or space-separated) for the
 * configured ellipsis character; a four-period run keeps one leading period.
 *
 * Does nothing unless the "smartEllipses" setting is truthy.
 *
 * @param array $parsedHTMLtoken parsedHTML token of type text
 * @return array the token with the updated "value"
 */
function smart_ellipses($parsedHTMLtoken) {
	if(empty($this->settings["smartEllipses"])) {
		return $parsedHTMLtoken;
	}

	$ellipsis = $this->chr["ellipses"];
	$text = $parsedHTMLtoken["value"];

	// four dots first, otherwise the three-dot rule would consume part of them
	$fourDots = array("....", ". . . .");
	$threeDots = array("...", ". . .");
	$text = str_replace($fourDots, ".".$ellipsis, $text);
	$text = str_replace($threeDots, $ellipsis, $text);

	$parsedHTMLtoken["value"] = $text;
	return $parsedHTMLtoken;
}
|
|
|
|
//expecting parsedHTML token of type text
|
|
/**
 * Replaces whole words in a text token using the "diacriticCustomReplacements"
 * and then the "diacriticWords" settings (each a needle => replacement map).
 *
 * Does nothing unless the "smartDiacritics" setting is truthy. Needles are
 * matched literally between word boundaries; they are escaped with preg_quote()
 * so that regex metacharacters or the "/" delimiter in a needle cannot break or
 * alter the pattern.
 *
 * @param array $parsedHTMLtoken parsedHTML token of type text
 * @return array the token with the updated "value"
 */
function smart_diacritics($parsedHTMLtoken) {
	if(!isset($this->settings["smartDiacritics"]) || !$this->settings["smartDiacritics"]) return $parsedHTMLtoken;

	// custom replacements run first, then the language word list
	foreach(array("diacriticCustomReplacements", "diacriticWords") as $settingName) {
		if( !isset($this->settings[$settingName]) || count($this->settings[$settingName]) == 0 ) continue;

		foreach($this->settings[$settingName] as $needle => $replacement) {
			$pattern = "/\b".preg_quote($needle, "/")."\b/";
			$parsedHTMLtoken["value"] = preg_replace($pattern, $replacement, $parsedHTMLtoken["value"]);
		}
	}

	return $parsedHTMLtoken;
}
|
|
|
|
|
|
|
|
|
|
//expecting parsedHTML token of type text
|
|
/**
 * Turns the parenthesised shorthands (c), (r), (p), (sm) and (tm), in lower or
 * upper case, into the corresponding mark characters from $this->chr.
 *
 * Does nothing unless the "smartMarks" setting is truthy.
 *
 * @param array $parsedHTMLtoken parsedHTML token of type text
 * @return array the token with the updated "value"
 */
function smart_marks($parsedHTMLtoken) {
	if(empty($this->settings["smartMarks"])) {
		return $parsedHTMLtoken;
	}

	// $this->chr key => shorthands it replaces; applied in this order
	$shorthands = array(
		"copyright"      => array("(c)", "(C)"),
		"registeredMark" => array("(r)", "(R)"),
		"soundCopyMark"  => array("(p)", "(P)"),
		"serviceMark"    => array("(sm)", "(SM)"),
		"tradeMark"      => array("(tm)", "(TM)"),
	);

	$text = $parsedHTMLtoken["value"];
	foreach($shorthands as $markName => $needles) {
		$text = str_replace($needles, $this->chr[$markName], $text);
	}
	$parsedHTMLtoken["value"] = $text;

	return $parsedHTMLtoken;
}
|
|
|
|
//expecting parsedHTML token of type text
|
|
/**
 * Converts the operators of simple numeric expressions in a text token to the
 * minus, multiplication and division characters from $this->chr, then restores
 * the plain hyphen or slash inside number ranges, simple fractions and dates.
 *
 * Does nothing unless the "smartMath" setting is truthy.
 *
 * @param array $parsedHTMLtoken parsedHTML token of type text
 * @return array the token with the updated "value"
 */
function smart_math($parsedHTMLtoken) {

	if(!isset($this->settings["smartMath"]) || !$this->settings["smartMath"]) return $parsedHTMLtoken;

	$value = $parsedHTMLtoken["value"];

	//first, let's find math equations
	$pattern = "/
		(?<=\A|\s)										# lookbehind assertion: preceded by beginning of string or space
		[\.,\'\"\¿\¡".$this->chr["ellipses"].$this->chr["singleQuoteOpen"].$this->chr["doubleQuoteOpen"].$this->chr["guillemetOpen"].$this->chr["guillemetClose"].$this->chr["singleLow9Quote"].$this->chr["doubleLow9Quote"]."]*
														# allowed preceding punctuation
		[\-\(".$this->chr["minus"]."]*					# optionally preceded by dash, minus sign or open parenthesis
		[0-9]+											# must begin with a number
		(\.[0-9]+)?										# optionally allow decimal values after first integer
		(												# followed by a math symbol and a number
			[\/\*x\-+=\^".$this->chr["minus"].$this->chr["multiplication"].$this->chr["division"]."]
														# allowed math symbols
			[\-\(".$this->chr["minus"]."]*				# optionally preceded by dash, minus sign or open parenthesis
			[0-9]+										# must begin with a number
			(\.[0-9]+)?									# optionally allow decimal values after first integer
			[\-\(\)".$this->chr["minus"]."]*			# optionally followed by dash, minus sign or parenthesis
		)+
		[\.,;:\'\"\?\!".$this->chr["ellipses"].$this->chr["singleQuoteClose"].$this->chr["doubleQuoteClose"].$this->chr["guillemetOpen"].$this->chr["guillemetClose"]."]*
														# allowed trailing punctuation
		(?=\Z|\s)										# lookahead assertion: followed by end of string or space
		/ux";
	$value = preg_replace_callback(
		$pattern,
		array($this, '_smart_math_callback'),
		$value
	);

	// fragments shared by the revert patterns below
	$minus = "[\-".$this->chr["minus"]."]";
	$slash = "[\/".$this->chr["division"]."]";
	$before = "(?<=\s|\A|".$this->chr["noBreakSpace"].")";
	$after = "(?=\s|\Z|\)|\]|\.|\,|\?|\;|\:|\'|\"|\!|".$this->chr["noBreakSpace"].")";

	// revert 4-4 to plain minus-hyphen so as to not mess with ranges of numbers (i.e. pp. 46-50)
	$pattern = "/
		(
			{$before}
			\d+
		)
		{$minus}
		(
			\d+
			{$after}
		)
		/xu";
	$value = preg_replace($pattern, "$1-$2", $value);

	//revert fractions to basic slash
	// we'll leave styling fractions to smart_fractions
	$pattern = "/
		(
			(?<=\s|\A|\'|\"|".$this->chr["noBreakSpace"].")
			\d+
		)
		".$this->chr["division"]."
		(
			\d+
			(?:st|nd|rd|th)?
			{$after}
		)
		/xu";
	$value = preg_replace($pattern, "$1/$2", $value);

	// revert date back to original formats

	// YYYY-MM-DD
	$pattern = "/
		(
			{$before}
			[12][0-9]{3}
		)
		{$minus}
		(
			(?:[0]?[1-9]|[1][0-2])
		)
		{$minus}
		(
			(?:[0]?[1-9]|[12][0-9]|[3][0-1])
			{$after}
		)
		/xu";
	$value = preg_replace($pattern, "$1-$2-$3", $value);

	// MM-DD-YYYY or DD-MM-YYYY
	$pattern = "/
		(?:
			(?:
				(
					{$before}
					(?:[0]?[1-9]|[1][0-2])
				)
				{$minus}
				(
					(?:[0]?[1-9]|[12][0-9]|[3][0-1])
				)
			)
			|
			(?:
				(
					{$before}
					(?:[0]?[1-9]|[12][0-9]|[3][0-1])
				)
				{$minus}
				(
					(?:[0]?[1-9]|[1][0-2])
				)
			)
		)
		{$minus}
		(
			[12][0-9]{3}
			{$after}
		)
		/xu";
	// only one of the two alternatives captures, so $1$3 and $2$4 each yield a single value
	$value = preg_replace($pattern, "$1$3-$2$4-$5", $value);

	// YYYY-MM or YYYY-DDD next
	$pattern = "/
		(
			{$before}
			[12][0-9]{3}
		)
		{$minus}
		(
			(?:
				(?:[0][1-9]|[1][0-2])
				|
				(?:[0][0-9][1-9]|[1-2][0-9]{2}|[3][0-5][0-9]|[3][6][0-6])
			)
			{$after}
		)
		/xu";
	// this pattern used to be built and then overwritten without being applied;
	// the generic range revert above already covers these strings, so this is a safety net
	$value = preg_replace($pattern, "$1-$2", $value);

	// MM/DD/YYYY or DD/MM/YYYY
	$pattern = "/
		(?:
			(?:
				(
					{$before}
					(?:[0][1-9]|[1][0-2])
				)
				{$slash}
				(
					(?:[0][1-9]|[12][0-9]|[3][0-1])
				)
			)
			|
			(?:
				(
					{$before}
					(?:[0][1-9]|[12][0-9]|[3][0-1])
				)
				{$slash}
				(
					(?:[0][1-9]|[1][0-2])
				)
			)
		)
		{$slash}
		(
			[12][0-9]{3}
			{$after}
		)
		/xu";
	$value = preg_replace($pattern, "$1$3/$2$4/$5", $value);

	$parsedHTMLtoken["value"] = $value;
	return $parsedHTMLtoken;
}
|
|
|
|
/**
 * preg_replace_callback() handler used by smart_math(): rewrites the operators
 * inside one matched expression.
 *
 * "-" becomes the minus sign, "/" the division sign, and both "x" and "*" the
 * multiplication sign, in that order.
 *
 * @param array $matches match array; only the full match at index 0 is used
 * @return string the full match with its operators replaced
 */
function _smart_math_callback($matches) {
	$asciiOperators = array("-", "/", "x", "*");
	$mathCharacters = array(
		$this->chr["minus"],
		$this->chr["division"],
		$this->chr["multiplication"],
		$this->chr["multiplication"],
	);

	// array form applies the pairs one after another, left to right
	return str_replace($asciiOperators, $mathCharacters, $matches[0]);
}
|
|
|
|
//expecting parsedHTML token of type text
|
|
// purposefully separated from smart_math because of HTML code injection
|
|
/**
 * Wraps the exponent of "number^word" expressions (e.g. 4^2) in a <sup> element
 * and drops the caret.
 *
 * Does nothing unless the "smartExponents" setting is truthy.
 *
 * @param array $parsedHTMLtoken parsedHTML token of type text
 * @return array the token with the updated "value"
 */
function smart_exponents($parsedHTMLtoken) {
	if(empty($this->settings["smartExponents"])) {
		return $parsedHTMLtoken;
	}

	// base: digits only; exponent: any run of word characters
	$exponentPattern = "/
		\b
		(\d+)
		\^
		(\w+)
		\b
		/xu";
	$superscripted = preg_replace($exponentPattern, '$1<sup>$2</sup>', $parsedHTMLtoken["value"]);
	$parsedHTMLtoken["value"] = $superscripted;

	return $parsedHTMLtoken;
}
|
|
|
|
// expecting parsedHTML token of type text
|
|
// call before style_numbers
|
|
// call after smart_ordinal_suffix
|
|
// purposefully separated from smart_math because of HTML code injection
|
|
/**
 * Styles fractions in a text token.
 *
 * With "fractionSpacing" enabled, the space between a whole number and a
 * following fraction ("1 1/2") is replaced by a no-break space (the narrow
 * variant when "smartFractions" is enabled too). With "smartFractions" enabled,
 * "a/b" is rewritten as <sup>a</sup>, the fraction slash, <sub>b</sub>.
 * Returns the token untouched when neither setting is truthy.
 *
 * @param array $parsedHTMLtoken parsedHTML token of type text
 * @return array the token with the updated "value"
 */
function smart_fractions($parsedHTMLtoken) {
	$smartFractions = isset($this->settings["smartFractions"]) && $this->settings["smartFractions"];
	$fractionSpacing = isset($this->settings["fractionSpacing"]) && $this->settings["fractionSpacing"];
	if(!$smartFractions && !$fractionSpacing) return $parsedHTMLtoken;

	// plain variables: "$this->chr['key']" inside a double-quoted string does NOT
	// interpolate the array element (it expands to "Array['key']")
	$nbsp = $this->chr["noBreakSpace"];
	$nbnsp = $this->chr["noBreakNarrowSpace"];

	if($fractionSpacing) {
		$pat = "/\b(\d+)\s(\d+\s?\/\s?\d+)\b/";
		$joiner = $smartFractions ? $nbnsp : $nbsp;
		$parsedHTMLtoken["value"] = preg_replace($pat, '$1'.$joiner.'$2', $parsedHTMLtoken["value"]);
	}

	if($smartFractions) {
		$pat = "/
			(?<=\A|\s|$nbsp|$nbnsp)								# lookbehind assertion: makes sure we are not messing up a url
			(\d+)
			(?:\s?\/\s?".$this->chr["zeroWidthSpace"]."?)		# strip out any zero-width space inserted by wrap_hard_hyphens
			(\d+)
			(
				(?:\<sup\>(?:st|nd|rd|th)<\/sup\>)?				# handle ordinals after fractions
				(?:\Z|\s|$nbsp|$nbnsp|\.|\!|\?|\)|\;|\:|\'|\")	# makes sure we are not messing up a url
			)
			/xu";

		$parsedHTMLtoken["value"] = preg_replace($pat, '<sup>$1</sup>'.$this->chr["fractionSlash"].'<sub>$2</sub>$3', $parsedHTMLtoken["value"]);
	}

	return $parsedHTMLtoken;
}
|
|
|
|
//DEPRECATED!!
|
|
//expecting parsedHTML token of type text
|
|
/**
 * Backwards-compatibility alias that forwards to smart_math().
 *
 * @param array $parsedHTMLtoken parsedHTML token of type text
 * @return array whatever smart_math() returns for the token
 */
function smart_multiplication($parsedHTMLtoken) {
	$processedToken = $this->smart_math($parsedHTMLtoken);

	return $processedToken;
}
|
|
|
|
// expecting parsedHTML token of type text
|
|
// call before style_numbers
|
|
/**
 * Wraps the st/nd/rd/th suffix that directly follows a number (1st, 22nd, ...)
 * in a <sup> element.
 *
 * Does nothing unless the "smartOrdinalSuffix" setting is truthy.
 *
 * @param array $parsedHTMLtoken parsedHTML token of type text
 * @return array the token with the updated "value"
 */
function smart_ordinal_suffix($parsedHTMLtoken) {
	if(empty($this->settings["smartOrdinalSuffix"])) {
		return $parsedHTMLtoken;
	}

	$ordinalPattern = "/\b(\d+)(st|nd|rd|th)\b/";
	$markup = '$1<sup>$2</sup>';
	$parsedHTMLtoken["value"] = preg_replace($ordinalPattern, $markup, $parsedHTMLtoken["value"]);

	return $parsedHTMLtoken;
}
|
|
|
|
|
|
|
|
//expecting parsedHTML token of type text
|
|
/**
 * Binds a one-character word to the word after it: in "space, single word
 * character, space, word character" the second space becomes a no-break space.
 *
 * Does nothing unless the "singleCharacterWordSpacing" setting is truthy. The
 * token's "prevChr" / "nextChr" entries, when non-empty, are glued on for context
 * while matching and cut off again afterwards.
 *
 * @param array $parsedHTMLtoken parsedHTML token of type text
 * @return array the token with the updated "value"
 */
function single_character_word_spacing($parsedHTMLtoken) {
	if(empty($this->settings["singleCharacterWordSpacing"])) {
		return $parsedHTMLtoken;
	}

	// surrounding characters, or "" when the token has none
	$prevChr = (isset($parsedHTMLtoken["prevChr"]) && $parsedHTMLtoken["prevChr"] != "") ? $parsedHTMLtoken["prevChr"] : "";
	$nextChr = (isset($parsedHTMLtoken["nextChr"]) && $parsedHTMLtoken["nextChr"] != "") ? $parsedHTMLtoken["nextChr"] : "";

	$text = $prevChr.$parsedHTMLtoken["value"].$nextChr;

	$text = preg_replace(
		"/
			(?:
				(\s)
				(\w)
				\s
				(?=\w)
			)
		/xu",
		'$1$2'.$this->chr['noBreakSpace'],
		$text
	);

	// drop the borrowed context characters again (one character at each end)
	$e = mb_detect_encoding($text."a", array("ASCII","UTF-8"));// ."a" is a hack; see http://www.php.net/manual/en/function.mb-detect-encoding.php#81936
	if(!isset($e) || $e == "") $e = "ASCII";
	if($prevChr != "") {
		$text = mb_substr($text, 1, mb_strlen($text, $e), $e);
	}
	if($nextChr != "") {
		$text = mb_substr($text, 0, mb_strlen($text, $e)-1, $e);
	}

	$parsedHTMLtoken["value"] = $text;
	return $parsedHTMLtoken;
}
|
|
|
|
|
|
|
|
//expecting parsedHTML token of type text
|
|
/**
 * Surrounds em dashes, then en dashes, with thin spaces.
 *
 * A dash is rewritten when it has whitespace on both sides (the whitespace is
 * replaced) or non-whitespace on both sides (thin spaces are inserted). Does
 * nothing unless the "dashSpacing" setting is truthy.
 *
 * @param array $parsedHTMLtoken parsedHTML token of type text
 * @return array the token with the updated "value"
 */
function dash_spacing($parsedHTMLtoken) {
	if(empty($this->settings["dashSpacing"])) {
		return $parsedHTMLtoken;
	}

	// same rule for both dash kinds; em dash is handled first
	foreach(array('emDash', 'enDash') as $dashName) {
		$spacingPattern = "/
			(?:
				\s
				(".$this->chr[$dashName].")
				\s
			)
			|
			(?:
				(?<=\S)							# lookbehind assertion
				(".$this->chr[$dashName].")
				(?=\S)							# lookahead assertion
			)
		/xu";

		// exactly one of the two groups captures, so $1$2 is always the dash itself
		$parsedHTMLtoken["value"] = preg_replace(
			$spacingPattern,
			$this->chr['thinSpace'].'$1$2'.$this->chr['thinSpace'],
			$parsedHTMLtoken["value"]
		);
	}

	return $parsedHTMLtoken;
}
|
|
|
|
|
|
|
|
// Collapses runs of whitespace inside a parsedHTML token of type text.
//
// Returns the token with "value" updated; does nothing unless settings["spaceCollapse"] is on.
// Steps, in order:
//   1. every run of plain ASCII whitespace becomes a single space
//   2. a run of spaces that contains a no-break space collapses to that no-break space
//   3. any other run that contains an unusual space collapses to the first unusual space in it
//   4. if the token has no "prevChr" (first text of a block level element), leading space is removed
function space_collapse($parsedHTMLtoken) {
	if(!isset($this->settings["spaceCollapse"]) || !$this->settings["spaceCollapse"]) return $parsedHTMLtoken;

	// Plain ASCII whitespace: tab, line feed, vertical tab, form feed, carriage return, space.
	// This is spelled out instead of \s because PHP's /u modifier also turns on Unicode
	// character properties, and then \s matches the no-break space and the other Unicode
	// spaces listed below. Step 1 would flatten them all to " " and steps 2 and 3 could
	// never give them priority.
	$plainSpace = '[\x09-\x0d\x20]';

	# the unusual space characters, as a regex alternation:
	#	no-break space | ethiopic wordspace | en quad | em quad | en space | em space
	#	three-per-em space | four-per-em space | six-per-em space | figure space
	#	punctuation space | thin space | hair space | narrow no-break space
	#	medium mathematical space | ideographic space
	# the zero-width space (U+200B), zero-width non-joiner (U+200C) and zero-width joiner (U+200D)
	# are part of the alternation as well
	$htmlSpaces = '
		\x{00a0}		# no-break space
		|
		\x{1361}		# ethiopic wordspace
		|
		\x{2000}		# en quad-space
		|
		\x{2001}		# em quad-space
		|
		\x{2002}		# en space
		|
		\x{2003}		# em space
		|
		\x{2004}		# three-per-em space
		|
		\x{2005}		# four-per-em space
		|
		\x{2006}		# six-per-em space
		|
		\x{2007}		# figure space
		|
		\x{2008}		# punctuation space
		|
		\x{2009}		# thin space
		|
		\x{200a}		# hair space
		|
		\x{200b}		# zero-width space
		|
		\x{200c}		# zero-width non-joiner
		|
		\x{200d}		# zero-width joiner
		|
		\x{202f}		# narrow no-break space
		|
		\x{205f}		# medium mathematical space
		|
		\x{3000}		# ideographic space
		'; // required modifiers: x (multiline pattern) u (utf8); must end with a newline so the last comment is closed

	$steps = array(
		// normal spacing
		array("/".$plainSpace."+/u", " "),
		// nbsp gets priority. if nbsp exists in a string of spaces, it collapses to nbsp
		array("/(?:".$plainSpace."|".$htmlSpaces.")*".$this->chr["noBreakSpace"]."(?:".$plainSpace."|".$htmlSpaces.")*/xu", $this->chr["noBreakSpace"]),
		// for any other spacing, replace with the first occurrence of an unusual space character
		array("/(?:".$plainSpace.")*(".$htmlSpaces.")(?:".$plainSpace."|".$htmlSpaces.")*/xu", '$1'),
	);

	// remove all spacing at beginning of block level elements
	if(!isset($parsedHTMLtoken["prevChr"]) || $parsedHTMLtoken["prevChr"] == NULL) { // we have the first text in a block level element
		$steps[] = array("/\A(?:".$plainSpace."|".$htmlSpaces.")+/xu", "");
	}

	$value = $parsedHTMLtoken["value"];
	foreach($steps as $step) {
		$collapsed = preg_replace($step[0], $step[1], $value);
		// preg_replace() returns NULL on failure (e.g. text that is not valid UTF-8);
		// in that case the step is skipped rather than wiping the text
		if(NULL !== $collapsed) $value = $collapsed;
	}
	$parsedHTMLtoken["value"] = $value;

	return $parsedHTMLtoken;
}
|
|
|
|
|
|
|
|
|
|
// Replaces the single whitespace between a number and a unit ("5 km") with the
// narrow no-break space from $this->chr so the two are not split across lines.
//
// Expects a parsedHTML token of type text and returns it with "value" updated.
// Does nothing unless settings["unitSpacing"] is on. Units listed in
// settings["units"] are matched literally, in addition to the built-in list.
function unit_spacing($parsedHTMLtoken) {
	if(!isset($this->settings["unitSpacing"]) || !$this->settings["unitSpacing"]) return $parsedHTMLtoken;

	$units = array();
	if(isset($this->settings["units"])) {
		foreach($this->settings["units"] as $unit) {
			if("" === (string) $unit) continue; // an empty alternative would match nothing useful

			// Backslash-escape every ASCII character that is not a letter or digit. In PCRE a
			// backslash before a non-alphanumeric always means "this character literally", so this
			// covers regex metacharacters, the "/" delimiter, and the "#" and whitespace that the
			// x modifier would otherwise treat as comment start / ignorable. Bytes >= 0x80
			// (multibyte UTF-8) are left alone.
			$units[] = preg_replace('/[^A-Za-z0-9\x80-\xff]/', '\\\\$0', $unit);
		}
	}

	$customUnits = implode("|", $units);
	if("" !== $customUnits) $customUnits .= "|";

	$unitPattern = $customUnits.'

		### Temporal units
		(?:ms|s|secs?|mins?|hrs?)\.?|
		milliseconds?|seconds?|minutes?|hours?|days?|years?|decades?|century|centuries|millennium|millennia|

		### Imperial units
		(?:in|ft|yd|mi)\.?|
		(?:ac|ha|oz|pt|qt|gal|lb|st)\.?|
		s\.f\.|sf|s\.i\.|si|square[ ]feet|square[ ]foot|
		inch|inches|foot|feet|yards?|miles?|acres?|hectares?|ounces?|pints?|quarts?|gallons?|pounds?|stones?|

		### Metric units (with prefixes)
		(?:p|µ|[mcdhkMGT])?
		(?:[mgstAKNJWCVFSTHBL]|mol|cd|rad|Hz|Pa|Wb|lm|lx|Bq|Gy|Sv|kat|Ω|Ohm|Ω|&Omega;|&\#0*937;|&\#[xX]0*3[Aa]9;)|
		(?:nano|micro|milli|centi|deci|deka|hecto|kilo|mega|giga|tera)?
		(?:liters?|meters?|grams?|newtons?|pascals?|watts?|joules?|amperes?)|

		### Computers units (KB, Kb, TB, Kbps)
		[kKMGT]?(?:[oBb]|[oBb]ps|flops)|

		### Money
		¢|M?(?:£|¥|€|\$)|

		### Other units
		°[CF]? |
		%|pi|M?px|em|en|[NSEOW]|[NS][EOW]|mbar

	'; // required modifiers: x (multiline pattern)
	// Notes on the list above:
	//  - the "ac|ha|oz|..." line needs its trailing "|"; without it the line is glued to
	//    "s\.f\." and none of those abbreviations can match on their own
	//  - the dollar sign must be written "\$"; a bare "$" is the end-of-subject anchor
	//  - "&Omega;" is accepted next to the numeric entity forms of the same character

	$spaced = preg_replace("/(\d\.?)\s($unitPattern)\b/x", '$1'.$this->chr["noBreakNarrowSpace"].'$2', $parsedHTMLtoken["value"]);

	// preg_replace() returns NULL when the pattern or subject cannot be processed;
	// leave the text untouched in that case
	if(NULL !== $spaced) $parsedHTMLtoken["value"] = $spaced;

	return $parsedHTMLtoken;
}
|
|
|
|
// Expects a list of parsedText tokens; returns the list with each "value" updated.
//
// With settings["hyphenHardWrap"] on, a zero-width space is placed after every hyphen,
// underscore and slash so the text may wrap there (the unicode hyphen is turned into
// "-" on the way). With settings["smartDashes"] on, every remaining "-" becomes the
// unicode hyphen from $this->chr. With neither setting on, the tokens come back untouched.
function wrap_hard_hyphens($parsedTextTokens) {
	$hardWrap = isset($this->settings["hyphenHardWrap"]) && $this->settings["hyphenHardWrap"];
	$smartDashes = isset($this->settings["smartDashes"]) && $this->settings["smartDashes"];
	if(!$hardWrap && !$smartDashes) return $parsedTextTokens;

	foreach(array_keys($parsedTextTokens) as $index) {
		$text = $parsedTextTokens[$index]["value"];

		if($hardWrap) {
			$breakAfter = $this->chr["zeroWidthSpace"];
			// searches are applied in order: "-", unicode hyphen, "_", "/"
			$text = str_replace(
				array('-', $this->chr["hyphen"], "_", "/"),
				array("-".$breakAfter, "-".$breakAfter, "_".$breakAfter, "/".$breakAfter),
				$text
			);
		}

		// handled here because we need to know we are inside a word and not a url
		if($smartDashes) {
			$text = str_replace("-", $this->chr["hyphen"], $text);
		}

		$parsedTextTokens[$index]["value"] = $text;
	}

	return $parsedTextTokens;
}
|
|
|
|
// Protects the last word of a block from being left alone on its own line.
//
// Expects a parsedHTML token of type text and returns it with "value" updated.
// Only a token without "nextChr" (the last text child of a block level element) is
// processed, and only when its text is detected as ASCII or UTF-8; anything else is
// returned unchanged. The decision for a matched widow is made in _dewidow_callback().
//
// Intervening inline tags may interfere with widow identification, but only if they
// separate the widow from the whitespace before or after it; that is a sacrifice of
// using the parser.
function dewidow($parsedHTMLtoken) {
	if(!isset($this->settings["dewidow"]) || !$this->settings["dewidow"]) return $parsedHTMLtoken;

	// anything followed by more text is not the end of the block
	if(isset($parsedHTMLtoken["nextChr"])) return $parsedHTMLtoken;

	// ."a" is a hack; see http://www.php.net/manual/en/function.mb-detect-encoding.php#81936
	$encoding = mb_detect_encoding($parsedHTMLtoken["value"]."a", array("ASCII","UTF-8", "ISO-8859-1"));

	if("UTF-8" == $encoding) {
		if(!function_exists('mb_strlen')) return $parsedHTMLtoken;
		$modifiers = "xu";
	} elseif("ASCII" == $encoding) {
		$modifiers = "x";
	} else {
		return $parsedHTMLtoken;
	}

	$zeroWidthSpace = $this->chr["zeroWidthSpace"];
	$softHyphen = $this->chr["softHyphen"];

	$widowPattern = "/
		(?:
			\A
			|
			(?:
				(								#subpattern 1: space before
					[\s".$zeroWidthSpace.$softHyphen."]+
				)
				(								#subpattern 2: neighbors widow (short as possible)
					[^\s".$zeroWidthSpace.$softHyphen."]+
				)
			)
		)
		(										#subpattern 3: space between
			[\s".$this->chr["noBreakSpace"]."]+
		)
		(										#subpattern 4: widow
			[^\s".$zeroWidthSpace."]+?
		)
		(										#subpattern 5: any trailing punctuation or spaces
			[^\w]*
		)
		\Z
	/".$modifiers;

	$parsedHTMLtoken["value"] = preg_replace_callback(
		$widowPattern,
		array($this, '_dewidow_callback'),
		$parsedHTMLtoken["value"]
	);

	return $parsedHTMLtoken;
}
|
|
|
|
|
|
// preg_replace_callback() handler for dewidow().
//
// $widow holds the match: [1] space before the neighbor, [2] the widow's neighbor,
// [3] the space between neighbor and widow, [4] the widow, [5] trailing punctuation/space.
// Returns the replacement text: the same pieces with [3] swapped for a no-break space
// when the widow qualifies, or with [3] kept when it does not.
// settings["dewidowMaxPull"] limits the neighbor's length and settings["dewidowMaxLength"]
// the widow's length; if either setting is missing or empty the match is returned as is.
function _dewidow_callback($widow) {
	if(!isset($this->settings["dewidowMaxPull"]) || !$this->settings["dewidowMaxPull"] || !isset($this->settings["dewidowMaxLength"]) || !$this->settings["dewidowMaxLength"]) return $widow[0];

	// ."a" is a hack; see http://www.php.net/manual/en/function.mb-detect-encoding.php#81936
	$encoding = mb_detect_encoding($widow[0]."a", array("ASCII","UTF-8", "ISO-8859-1"));
	$multibyte = ("UTF-8" == $encoding);

	// if we are here, we know that widows are being protected in some fashion
	// with that, we will assert that widows should never be hyphenated or wrapped
	// as such, we will strip soft hyphens and zero-width-spaces
	$breakHints = array($this->chr["zeroWidthSpace"], $this->chr["softHyphen"]);
	$widow[4] = str_replace($breakHints, "", $widow[4]);

	// Whitespace after the widow becomes a no-break space so trailing punctuation stays attached.
	// mb_ereg_replace() takes its pattern without delimiters, so the former "/\s+/" never matched
	// and this step did nothing. For UTF-8 text the /u modifier keeps \s from matching a byte in
	// the middle of a multibyte character.
	$trailing = preg_replace($multibyte ? '/\s+/u' : '/\s+/', $this->chr["noBreakSpace"], $widow[5]);
	if(NULL !== $trailing) $widow[5] = $trailing; // NULL means the replace failed; keep the text
	$widow[5] = str_replace($breakHints, "", $widow[5]);

	$unprotected = $widow[1].$widow[2].$widow[3].$widow[4].$widow[5];

	// eject if the widow's neighbor is preceded by a no-break space (the pulled text would be too long)
	// note the argument order: the no-break space is looked for inside $widow[1]
	if($widow[1] == "" || FALSE !== strpos($widow[1], $this->chr["noBreakSpace"])) return $unprotected;

	// lengths are counted in characters for UTF-8 text and in bytes otherwise
	if($multibyte) {
		$neighborLength = mb_strlen($widow[2], "UTF-8");
		$widowLength = mb_strlen($widow[4], "UTF-8");
	} else {
		$neighborLength = strlen($widow[2]);
		$widowLength = strlen($widow[4]);
	}

	// eject if widows neighbor length exceeds the max allowed or widow length exceeds max allowed
	if(($widow[2] != "" && $neighborLength > $this->settings["dewidowMaxPull"]) || $widowLength > $this->settings["dewidowMaxLength"]) {
		return $unprotected;
	}

	// lets protect some widows!
	return $widow[1].$widow[2].$this->chr["noBreakSpace"].$widow[4].$widow[5];
}
|
|
|
|
|
|
// Expects a list of parsedText tokens; returns the list with URL tokens made wrappable.
//
// A token counts as a URL when its whole value is an optional scheme, a domain ending
// in a known top level domain (optionally with a port) and an optional path. For such a
// token zero-width spaces are inserted: after the scheme, at the hyphenation points of
// each domain part, before the dots and hyphens of the domain, and before each path
// character except the first one and the last few (settings["urlMinAfterWrap"]).
// Nothing happens unless settings["urlWrap"] and settings["urlMinAfterWrap"] are both set and truthy.
function wrap_urls($parsedTextTokens) {
	$enabled = isset($this->settings["urlWrap"]) && $this->settings["urlWrap"]
		&& isset($this->settings["urlMinAfterWrap"]) && $this->settings["urlMinAfterWrap"];
	if(!$enabled) return $parsedTextTokens;

	// test for and parse urls
	$validTLD = 'ac|ad|aero|ae|af|ag|ai|al|am|an|ao|aq|arpa|ar|asia|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|biz|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|cat|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|com|coop|co|cr|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|edu|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gov|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|info|int|in|io|iq|ir|is|it|je|jm|jobs|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mil|mk|ml|mm|mn|mobi|mo|mp|mq|mr|ms|mt|museum|mu|mv|mw|mx|my|mz|name|na|nc|net|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|org|pa|pe|pf|pg|ph|pk|pl|pm|pn|pro|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tel|tf|tg|th|tj|tk|tl|tm|tn|to|tp|travel|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw';
	$urlScheme = '(?:https?|ftps?|file|nfs|feed|itms|itpc)';
	$urlPattern = "(?:
		\A
		($urlScheme:\/\/)?			# Subpattern 1: contains _http://_ if it exists
		(							# Subpattern 2: contains subdomains.domain.tld
			(?:
				[a-z0-9]			# first chr of (sub)domain can not be a hyphen
				[a-z0-9\-]{0,61}	# middle chrs of (sub)domain may be a hyphen;
									# limit qty of middle chrs so total domain does not exceed 63 chrs
				[a-z0-9]			# last chr of (sub)domain can not be a hyphen
				\.					# dot separator
			)+
			(?:
				$validTLD			# validates top level domain
			)
			(?:						# optional port numbers
				:
				(?:
					[1-5]?[0-9]{1,4} | 6[0-4][0-9]{3} | 65[0-4][0-9]{2} | 655[0-2][0-9] | 6553[0-5]
				)
			)?
		)
		(							# Subpattern 3: contains path following domain
			(?:
				\/					# marks nested directory
				[a-z0-9\"\$\-_\.\+!\*\'\(\),;\?:@=&\#]+	# valid characters within directory structure
			)*
			[\/]?					# trailing slash if any
		)
		\Z
	)"; // required modifiers: x (multiline pattern) i (case insensitive)

	$breakPoint = $this->chr["zeroWidthSpace"];

	foreach(array_keys($parsedTextTokens) as $tokenKey) {
		if(!preg_match("`$urlPattern`xi", $parsedTextTokens[$tokenKey]["value"], $urlMatch)) continue;

		// $urlMatch[1] holds "http://"
		// $urlMatch[2] holds "subdomains.domain.tld"
		// $urlMatch[3] holds the path after the domain

		if($urlMatch[1]) {
			$http = $urlMatch[1].$breakPoint;
		} else {
			$http = "";
		}

		// split the domain at dots and hyphens, keeping those delimiters as parts of their own
		$domainParts = preg_split('#(\-|\.)#', $urlMatch[2], -1, PREG_SPLIT_DELIM_CAPTURE);

		// this is a hack, but it works:
		// dress the parts up like a group of word tokens so the hyphenator accepts them
		$wordLikeTokens = array();
		foreach($domainParts as $partKey => $part) {
			$wordLikeTokens[$partKey] = array("value" => $part);
		}

		// do the hyphenation
		$wordLikeTokens = $this->do_hyphenate($wordLikeTokens);

		// restore format
		foreach($wordLikeTokens as $partKey => $wordLikeToken) {
			$domainParts[$partKey] = $wordLikeToken["value"];
		}

		// recombine, turning soft hyphens into zero-width spaces on the way
		$domain = "";
		foreach($domainParts as $partKey => $part) {
			$part = str_replace($this->chr["softHyphen"], $breakPoint, $part);

			// we also insert zero-width spaces before periods and hyphens
			if($partKey > 0 && strlen($part) == 1) {
				$part = $breakPoint.$part;
			}

			$domain .= $part;
		}

		// break up the URL path to individual characters
		$pathCharacters = str_split($urlMatch[3], 1);
		$pathCount = count($pathCharacters);
		$path = "";
		foreach($pathCharacters as $position => $character) {
			// no break before the first character or within the last few characters
			$keepAttached = (0 == $position) || ($pathCount - $position < $this->settings["urlMinAfterWrap"]);
			if(!$keepAttached) {
				$path .= $breakPoint;
			}
			$path .= $character;
		}

		$parsedTextTokens[$tokenKey]["value"] = $http.$domain.$path;
	}

	return $parsedTextTokens;
}
|
|
|
|
// Expects a list of parsedText tokens; returns the list with email tokens made wrappable.
//
// A token counts as an email address when its whole value is a local part, "@", and a
// domain ending in a known top level domain. For such a token a zero-width space is
// inserted after every character that is neither a letter nor a digit (".", "@", "-", ...),
// so the address may wrap after punctuation but numbers are not broken apart.
// Nothing happens unless settings["emailWrap"] is on.
function wrap_emails($parsedTextTokens) {
	if(!isset($this->settings["emailWrap"]) || !$this->settings["emailWrap"]) return $parsedTextTokens;

	// test for and parse email addresses
	$validTLD = 'ac|ad|aero|ae|af|ag|ai|al|am|an|ao|aq|arpa|ar|asia|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|biz|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|cat|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|com|coop|co|cr|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|edu|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gov|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|info|int|in|io|iq|ir|is|it|je|jm|jobs|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mil|mk|ml|mm|mn|mobi|mo|mp|mq|mr|ms|mt|museum|mu|mv|mw|mx|my|mz|name|na|nc|net|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|org|pa|pe|pf|pg|ph|pk|pl|pm|pn|pro|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tel|tf|tg|th|tj|tk|tl|tm|tn|to|tp|travel|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw';
	$emailPattern = "(?:
		\A
		[a-z0-9\!\#\$\%\&\'\*\+\/\=\?\^\_\`\{\|\}\~\-]+
		(?:
			\.
			[a-z0-9\!\#\$\%\&\'\*\+\/\=\?\^\_\`\{\|\}\~\-]+
		)*
		@
		(?:
			[a-z0-9]
			[a-z0-9\-]{0,61}
			[a-z0-9]
			\.
		)+
		(?:
			$validTLD
		)
		\Z
	)"; // required modifiers: x (multiline pattern) i (case insensitive)

	foreach($parsedTextTokens as $tokenKey => $token) {
		if(!preg_match("/$emailPattern/xi", $token["value"])) continue;

		// a break opportunity after each punctuation character; letters and digits stay together
		$wrapped = preg_replace("/([^a-zA-Z0-9])/", '$1'.$this->chr["zeroWidthSpace"], $token["value"]);

		// preg_replace() returns NULL on failure; keep the address as it was in that case
		if(NULL !== $wrapped) $parsedTextTokens[$tokenKey]["value"] = $wrapped;
	}

	return $parsedTextTokens;
}
|
|
|
|
// Wraps words written in capitals (digits allowed) in <span class="caps">.
//
// Expects a parsedHTML token of type text and returns it with "value" updated.
// Does nothing unless settings["styleCaps"] is on.
// only call if you are certain that no html tags have been injected containing capital letters
// call before style_numbers
function style_caps($parsedHTMLtoken) {
	if(!isset($this->settings["styleCaps"]) || !$this->settings["styleCaps"]) return $parsedHTMLtoken;

	// \p{Lu} equals upper case letters and should match non english characters; since PHP 4.4.0 and 5.1.0
	// for more info, see http://www.regextester.com/pregsyntax.html#regexp.reference.unicode
	$pattern = '
		(?<![\w\-_'.$this->chr["zeroWidthSpace"].$this->chr["softHyphen"].'])
								# negative lookbehind assertion
		(
			(?:					# CASE 1: " 9A "
				[0-9]+			# starts with at least one number
				\p{Lu}			# must contain at least one capital letter
				(?:\p{Lu}|[0-9]|\-|_|'.$this->chr["zeroWidthSpace"].'|'.$this->chr["softHyphen"].')*
								# may be followed by any number of numbers capital letters, hyphens, underscores, zero width spaces, or soft hyphens
			)
			|
			(?:					# CASE 2: " A9 "
				\p{Lu}			# starts with capital letter
				(?:\p{Lu}|[0-9])	# must be followed a number or capital letter
				(?:\p{Lu}|[0-9]|\-|_|'.$this->chr["zeroWidthSpace"].'|'.$this->chr["softHyphen"].')*
								# may be followed by any number of numbers capital letters, hyphens, underscores, zero width spaces, or soft hyphens
			)
		)
		(?![\w\-_'.$this->chr["zeroWidthSpace"].$this->chr["softHyphen"].'])
								# negative lookahead assertion
	'; // required modifiers: x (multiline pattern) u (utf8)

	$styled = preg_replace("/$pattern/xu", '<span class="caps">$1</span>', $parsedHTMLtoken["value"]);

	// preg_replace() returns NULL on failure (with /u that includes text that is not valid
	// UTF-8); keep the text as it was instead of replacing it with NULL
	if(NULL !== $styled) $parsedHTMLtoken["value"] = $styled;

	return $parsedHTMLtoken;
}
|
|
|
|
// Wraps every run of digits in <span class="numbers">, even inside a word
// (i.e. A9 becomes A<span class="numbers">9</span>).
//
// Expects a parsedHTML token of type text and returns it with "value" updated.
// Does nothing unless settings["styleNumbers"] is on.
// call after style_caps so A9 becomes <span class="caps">A<span class="numbers">9</span></span>
// only call if you are certain that no html tags have been injected containing numbers
// call after smart_fractions, smart_ordinal_suffix and style_caps
function style_numbers($parsedHTMLtoken) {
	if(!isset($this->settings["styleNumbers"]) || !$this->settings["styleNumbers"]) return $parsedHTMLtoken;

	$pattern = '([0-9]+)'; // required modifier: u (utf8)
	$styled = preg_replace("/$pattern/u", '<span class="numbers">$1</span>', $parsedHTMLtoken["value"]);

	// preg_replace() returns NULL on failure (with /u that includes text that is not valid
	// UTF-8); keep the text as it was instead of replacing it with NULL
	if(NULL !== $styled) $parsedHTMLtoken["value"] = $styled;

	return $parsedHTMLtoken;
}
|
|
|
|
// Wraps ampersands in <span class="amp">
// (i.e. H&amp;J becomes H<span class="amp">&amp;</span>J).
//
// Expects a parsedHTML token of type text and returns it with "value" updated.
// Does nothing unless settings["styleAmpersands"] is on.
// call after style_caps so the caps span ends up outside the amp span
// note that all standalone ampersands were previously converted to the entity &amp;
// only call if you are certain that no html tags have been injected containing "&"
function style_ampersands($parsedHTMLtoken) {
	if(!isset($this->settings["styleAmpersands"]) || !$this->settings["styleAmpersands"]) return $parsedHTMLtoken;

	// The entity form is matched, not a bare "&": a pattern of just "&" followed by ";"
	// would only hit the literal text "&;" and never an ampersand between words, while a
	// bare "&" would also hit the start of other entities such as the ones this class inserts.
	$pattern = '(\&amp\;)'; // required modifier: u (utf8)
	$styled = preg_replace("/$pattern/u", '<span class="amp">$1</span>', $parsedHTMLtoken["value"]);

	// preg_replace() returns NULL on failure (with /u that includes text that is not valid
	// UTF-8); keep the text as it was instead of replacing it with NULL
	if(NULL !== $styled) $parsedHTMLtoken["value"] = $styled;

	return $parsedHTMLtoken;
}
|
|
|
|
// Styles an initial quote or guillemet: when the first text of a block level element
// starts with a quote character, that character is wrapped in <span class="quo">
// (single quotes and the comma look-alike) or <span class="dquo"> (double quotes, guillemets).
//
// Expects a parsedHTML token of type text and returns it with "value" updated.
// $isTitle: when the token has no parents, treat it as sitting in an h2 (page title).
// Only applies when settings["styleInitialQuotes"] is on and the immediate parent tag
// is listed in settings["initialQuoteTags"].
function style_initial_quotes($parsedHTMLtoken, $isTitle = FALSE) {
	if(!isset($this->settings["styleInitialQuotes"]) || !$this->settings["styleInitialQuotes"] || !isset($this->settings["initialQuoteTags"]) || !$this->settings["initialQuoteTags"]) return $parsedHTMLtoken;

	// only the first text in a block level element qualifies
	if(isset($parsedHTMLtoken["prevChr"]) && $parsedHTMLtoken["prevChr"] != NULL) return $parsedHTMLtoken;

	// ."a" is a hack; see http://www.php.net/manual/en/function.mb-detect-encoding.php#81936
	$e = mb_detect_encoding($parsedHTMLtoken["value"]."a", array("ASCII","UTF-8", "ISO-8859-1"));
	if(!$e) $e = "ASCII";

	$firstChr = mb_substr($parsedHTMLtoken["value"], 0, 1, $e);

	$singleQuotes = array("'", $this->chr["singleQuoteOpen"], $this->chr["singleLow9Quote"], ",");
	$doubleQuotes = array("\"", $this->chr["doubleQuoteOpen"], $this->chr["guillemetOpen"], $this->chr["guillemetClose"], $this->chr["doubleLow9Quote"]);
	if(in_array($firstChr, $singleQuotes, TRUE)) {
		$class = "quo";
	} elseif(in_array($firstChr, $doubleQuotes, TRUE)) { // double quotes or guillemets
		$class = "dquo";
	} else {
		return $parsedHTMLtoken;
	}

	$immediateParent = NULL;
	if(!empty($parsedHTMLtoken["parents"]) && is_array($parsedHTMLtoken["parents"])) {
		$immediateParent = end($parsedHTMLtoken["parents"]);
	} elseif($isTitle) {
		// assume page title is h2
		$immediateParent = array("tagName" => "h2");
	}

	// The tag check used to read $immediateParent["tagName"] unconditionally, which raised
	// "Illegal string offset 'tagName'" whenever the parent was not an array; it was then
	// commented out, leaving nothing ever styled. Here the tag name is only read from an
	// array that has it. A bare string is accepted as the tag name too, since hyphenate()
	// compares "parents" entries directly against tag names -- NOTE(review): confirm which
	// shape the parser actually produces.
	$tagName = "";
	if(is_array($immediateParent) && isset($immediateParent["tagName"])) {
		$tagName = $immediateParent["tagName"];
	} elseif(is_string($immediateParent)) {
		$tagName = $immediateParent;
	}

	if("" === $tagName || !in_array($tagName, (array) $this->settings["initialQuoteTags"])) return $parsedHTMLtoken;

	// everything after the first character, in the detected encoding
	$rest = mb_substr($parsedHTMLtoken["value"], 1, mb_strlen($parsedHTMLtoken["value"], $e), $e);
	$parsedHTMLtoken["value"] = '<span class="'.$class.'">'.$firstChr.'</span>'.$rest;

	return $parsedHTMLtoken;
}
|
|
|
|
|
|
// Merges a PatGen segment pattern into the PatGen pattern of the whole word.
//
// $wordPattern     list of digit strings, one per gap of the word
// $segmentPattern  list of digit strings for the segment ($segmentLength + 1 entries are read)
// $segmentPosition index in $wordPattern where the segment starts
// $segmentLength   length of the segment in characters
//
// For every position covered by the segment the larger of the two digits is kept;
// on a tie the word pattern's own entry stays. Returns the updated word pattern.
function hyphenation_pattern_injection($wordPattern, $segmentPattern, $segmentPosition, $segmentLength) {
	for($offset = 0; $offset <= $segmentLength; $offset++) {
		$wordIndex = $segmentPosition + $offset;
		$current = $wordPattern[$wordIndex];
		$candidate = $segmentPattern[$offset];

		if(intval($current) < intval($candidate)) {
			$wordPattern[$wordIndex] = $candidate;
		} else {
			$wordPattern[$wordIndex] = $current;
		}
	}

	return $wordPattern;
}
|
|
|
|
// Expects parsedText tokens filtered to words; returns them hyphenated by do_hyphenate().
//
// $isTitle: the tokens belong to a page title.
// Returns the tokens untouched when settings["hyphenation"] is off, or when the tokens
// are a title or sit under a heading tag (h1-h6 found in $parsedTextTokens["parents"])
// and settings["hyphenateTitle"] is not on.
function hyphenate($parsedTextTokens, $isTitle = FALSE) {
	if(!isset($this->settings["hyphenation"]) || !$this->settings["hyphenation"]) return $parsedTextTokens;

	$isHeading = FALSE;
	if(isset($parsedTextTokens["parents"])) {
		$headingTags = array("h1", "h2", "h3", "h4", "h5", "h6");
		foreach($parsedTextTokens["parents"] as $tagName) {
			if(in_array($tagName, $headingTags)) {
				$isHeading = TRUE;
				break;
			}
		}
	}

	$titlesMayBeHyphenated = isset($this->settings["hyphenateTitle"]) && $this->settings["hyphenateTitle"];
	if(!$titlesMayBeHyphenated && ($isTitle || $isHeading)) return $parsedTextTokens;

	// the work lives in a separate function so it can run without the test for
	// settings["hyphenation"] - such as with url wrapping
	return $this->do_hyphenate($parsedTextTokens);
}
|
|
// Inserts soft hyphens into words; the worker behind hyphenate() and wrap_urls().
//
// Expects parsedText tokens filtered to words and returns them with "value" updated.
// Returns the tokens untouched unless settings hyphenMinLength and hyphenMinBefore are
// truthy and hyphenationPatternMaxSegment, hyphenationPatternExceptions and
// hyphenationPattern are set. Per token:
//   - text that is neither ASCII nor UTF-8, or shorter than hyphenMinLength, is skipped
//   - a word starting with a capital is skipped unless settings["hyphenateTitleCase"] is on
//   - a word found in the exceptions list is hyphenated exactly as listed there
//   - otherwise the PatGen patterns ("begin", "end", "all") decide: a soft hyphen goes
//     wherever the merged pattern holds an odd digit, respecting hyphenMinBefore/hyphenMinAfter
// Side effect: builds and caches settings["hyphenationExceptions"] on first use.
function do_hyphenate($parsedTextTokens) {
	if(!isset($this->settings["hyphenMinLength"]) || !$this->settings["hyphenMinLength"]) return $parsedTextTokens;
	if(!isset($this->settings["hyphenMinBefore"]) || !$this->settings["hyphenMinBefore"]) return $parsedTextTokens;
	if(!isset($this->settings["hyphenationPatternMaxSegment"])) return $parsedTextTokens;
	if(!isset($this->settings["hyphenationPatternExceptions"])) return $parsedTextTokens;
	if(!isset($this->settings["hyphenationPattern"])) return $parsedTextTokens;

	$encodings = array("ASCII","UTF-8", "ISO-8859-1");

	// a missing hyphenMinAfter counts as 0 (the value the unchecked read used to produce)
	$hyphenMinAfter = isset($this->settings["hyphenMinAfter"]) ? $this->settings["hyphenMinAfter"] : 0;
	$hyphenateTitleCase = isset($this->settings["hyphenateTitleCase"]) && $this->settings["hyphenateTitleCase"];

	// make sure we have full exceptions list
	if(!isset($this->settings["hyphenationExceptions"])) {
		$hasCustomExceptions = isset($this->settings["hyphenationCustomExceptions"]) && $this->settings["hyphenationCustomExceptions"];
		if($this->settings["hyphenationPatternExceptions"] || $hasCustomExceptions) {
			if(isset($this->settings["hyphenationCustomExceptions"]) && is_array($this->settings["hyphenationCustomExceptions"])) {
				// merges custom and language specific word hyphenations
				$exceptions = array_merge($this->settings["hyphenationCustomExceptions"], $this->settings["hyphenationPatternExceptions"]);
			} else {
				$exceptions = $this->settings["hyphenationPatternExceptions"];
			}
			$this->settings["hyphenationExceptions"] = $exceptions;
		} else {
			$this->settings["hyphenationExceptions"] = array();
		}
	}

	foreach($parsedTextTokens as $tokenKey => $parsedTextToken) {
		$word = $parsedTextToken["value"];

		// ."a" is a hack; see http://www.php.net/manual/en/function.mb-detect-encoding.php#81936
		$encoding = mb_detect_encoding($word."a", $encodings);

		// decided per token, so plain ASCII words keep the faster single byte functions
		// even after a UTF-8 word has been seen
		if("UTF-8" == $encoding) {
			if(!function_exists('mb_strlen')) continue;
			$multibyte = TRUE;
		} elseif("ASCII" == $encoding) {
			$multibyte = FALSE;
		} else {
			continue;
		}

		if($multibyte) {
			$wordLength = mb_strlen($word, "UTF-8");
			$theKey = mb_strtolower($word, "UTF-8");
		} else { // same as above without multibyte string functions to improve performance
			$wordLength = strlen($word);
			$theKey = strtolower($word);
		}

		if($wordLength < $this->settings["hyphenMinLength"]) continue;

		// if this is a capitalized word, and settings do not allow hyphenation of such, abort!
		// note. this is different than uppercase words, where we are looking for title case
		if(!$hyphenateTitleCase) {
			// compare whole first characters: the first BYTE of a multibyte capital is
			// usually the same as that of its lower case form, which would hide the capital
			if($multibyte) {
				$firstOfKey = mb_substr($theKey, 0, 1, "UTF-8");
				$firstOfWord = mb_substr($word, 0, 1, "UTF-8");
			} else {
				$firstOfKey = substr($theKey, 0, 1);
				$firstOfWord = substr($word, 0, 1);
			}
			if($firstOfKey != $firstOfWord) continue;
		}

		$wordPattern = NULL;

		// give exceptions preference
		if(isset($this->settings["hyphenationExceptions"][$theKey])) {
			// Set the wordPattern - this method keeps any contextually important capitalization
			$lowercaseHyphenedWord = $this->settings["hyphenationExceptions"][$theKey];
			if($multibyte) {
				$lhwArray = $this->mb_str_split($lowercaseHyphenedWord, 1, "UTF-8");
				$lhwLength = mb_strlen($lowercaseHyphenedWord, "UTF-8");
			} else {
				$lhwArray = str_split($lowercaseHyphenedWord, 1);
				$lhwLength = strlen($lowercaseHyphenedWord);
			}

			// one digit per letter: "9" (odd, so a break) for the letter that follows a "-"
			// in the listed hyphenation, "0" for every other letter
			$wordPattern = array();
			for($i=0; $i < $lhwLength; $i++) {
				if("-" == $lhwArray[$i]) {
					$wordPattern[] = "9";
					$i++; // the letter after the hyphen is covered by the "9" just pushed
				} else {
					$wordPattern[] = "0";
				}
			}
			$wordPattern[] = "0"; // for consistent length with the other word patterns
		}

		if(NULL === $wordPattern) {
			// first we set up the matching pattern to be a series of zeros one character longer than the word
			$wordPattern = array();
			for($i=0; $i < $wordLength + 1; $i++) {
				$wordPattern[] = "0";
			}

			// we grab all possible segments of the word of length 2 through settings["hyphenationPatternMaxSegment"]
			for($segmentLength=2; ($segmentLength <= $wordLength) && ($segmentLength <= $this->settings["hyphenationPatternMaxSegment"]); $segmentLength++) {
				for($segmentPosition=0; $segmentPosition + $segmentLength <= $wordLength; $segmentPosition++) {
					if($multibyte) {
						$segment = mb_strtolower(mb_substr($word, $segmentPosition, $segmentLength, "UTF-8"), "UTF-8");
					} else {
						$segment = strtolower(substr($word, $segmentPosition, $segmentLength));
					}

					// pattern groups that apply to a segment at this position:
					// "begin" at the start of the word, "end" at its end, "all" anywhere
					$groups = array();
					if(0 == $segmentPosition) $groups[] = "begin";
					if($segmentPosition + $segmentLength == $wordLength) $groups[] = "end";
					$groups[] = "all";

					foreach($groups as $group) {
						if(!isset($this->settings["hyphenationPattern"][$group][$segment])) continue;

						if($multibyte) {
							$segmentPattern = $this->mb_str_split($this->settings["hyphenationPattern"][$group][$segment], 1, "UTF-8");
						} else {
							$segmentPattern = str_split($this->settings["hyphenationPattern"][$group][$segment], 1);
						}
						$wordPattern = $this->hyphenation_pattern_injection($wordPattern, $segmentPattern, $segmentPosition, $segmentLength);
					}
				}
			}
		}

		// add soft-hyphen based on $wordPattern
		if($multibyte) {
			$wordArray = $this->mb_str_split($word, 1, "UTF-8");
		} else {
			$wordArray = str_split($word, 1);
		}

		$hyphenatedWord = "";
		for($i=0; $i < $wordLength; $i++) {
			// an odd digit marks a permitted break before character $i
			// (a position missing from the pattern counts as "no break")
			$breakHere = isset($wordPattern[$i]) && $this->is_odd(intval($wordPattern[$i]));
			if($breakHere && ($i >= $this->settings["hyphenMinBefore"]) && ($i < $wordLength - $hyphenMinAfter)) {
				$hyphenatedWord .= $this->chr["softHyphen"];
			}
			$hyphenatedWord .= $wordArray[$i];
		}

		$parsedTextTokens[$tokenKey]["value"] = $hyphenatedWord;
	}

	return $parsedTextTokens;
}
|
|
|
|
########################################################################
|
|
# params: $codes = decimal value corresponding to a unicode character; either an array
#         of such values, or one or more scalar values passed as separate arguments
# Returns: UTF-8 string made of the decoded characters, in the order given
function uchr ($codes) {
	// scalar call style: collect every argument that was passed
	$codeList = is_scalar($codes) ? func_get_args() : $codes;

	$characters = array();
	foreach ($codeList as $code) {
		// let the entity decoder turn the numeric entity into the character
		$characters[] = html_entity_decode('&#'.$code.';',ENT_NOQUOTES,'UTF-8');
	}

	return implode('', $characters);
}
|
|
|
|
// is a number odd? returns the remainder of dividing by 2:
// 0 if even, 1 if odd (-1 for a negative odd number)
function is_odd($number) {
	$remainder = $number % 2;
	return $remainder;
}
|
|
|
|
// Splits a string into chunks of $length characters (not bytes); multibyte character
// support is built in to accommodate languages with multibyte alphabets.
//
// $str      string to split
// $length   number of characters per chunk (the last chunk may be shorter)
// $encoding character encoding of $str
// Returns the list of chunks, or FALSE when the mbstring functions are not available
// or $length is smaller than 1.
function mb_str_split($str, $length = 1, $encoding = 'UTF-8') {
	if(!function_exists('mb_strlen')) return FALSE;
	if ($length < 1) return FALSE;

	$result = array();
	// measured once: the string does not change while it is being split
	$totalLength = mb_strlen($str, $encoding);
	for ($i = 0; $i < $totalLength; $i += $length) {
		$result[] = mb_substr($str, $i, $length, $encoding);
	}
	return $result;
}
|
|
|
|
|
|
|
|
|
|
##########################################################################################
|
|
##########################################################################################
|
|
##########################################################################################
|
|
###
|
|
### portions of this code have been inspired by:
|
|
### -typogrify (http://code.google.com/p/typogrify/)
|
|
### -WordPress code for wptexturize (http://xref.redalt.com/wptrunk/nav.htm?index.htm)
|
|
### -PHP SmartyPants Typographer (http://michelf.com/projects/php-smartypants/)
|
|
###
|
|
|
|
}
|