initial commit

This commit is contained in:
Tobias Diekershoff 2015-01-04 21:46:33 +01:00
commit 8eeedc3a3c
54 changed files with 179682 additions and 0 deletions

View file

@ -0,0 +1,37 @@
1.20 - December 20, 2009
Added HTML5 elements to parsing algorithm for greater contextual awareness
1.19 - December 1, 2009
Corrected some uninitialized variables
1.12 - August 17, 2009
Corrected multibyte handling of nextChr and prevChr
1.10 - August 14, 2009
Increased set of recognized multibyte word characters
Corrected multibyte handling of nextChr and prevChr
1.4 - July 23, 2009
Added letter connectors (like soft-hyphens) as prohibited characters for get_words if it is set to strictly return letter only words.
1.3 - July 23, 2009
Uninitialized variables corrected throughout.
1.0 - July 15, 2009
Removed beta tag
1.0 beta 7 - July 10, 2009
added "/" as a valid word character so we could capture "this/that" as a word for processing (similar to "mother-in-law")
Corrected error where characters from the Latin 1 Supplement Block were not recognized as word characters
1.0 beta 1
initial release

View file

@ -0,0 +1,1166 @@
<?php
/*
Project Name: PHP Parser
Project URI: http://kingdesk.com/projects/php-parser/
Author: Jeffrey D. King
Author URI: http://kingdesk.com/about/jeff/
Copyright 2009, KINGdesk, LLC. Licensed under the GNU General Public License 2.0. If you use, modify and/or redistribute this software, you must leave the KINGdesk, LLC copyright information, the request for a link to http://kingdesk.com, and the web design services contact information unchanged. If you redistribute this software, or any derivative, it must be released under the GNU General Public License 2.0. This program is distributed without warranty (implied or otherwise) of suitability for any particular purpose. See the GNU General Public License for full license terms <http://creativecommons.org/licenses/GPL/2.0/>.
WE DON'T WANT YOUR MONEY: NO TIPS NECESSARY! If you enjoy this plugin, a link to http://kingdesk.com from your website would be appreciated.
For web design services, please contact info@kingdesk.com.
*/
// first we define some constants
// Valid constant names
// Tag selectors accepted by the *_tags() methods; aliases share one value.
define('ALL_TAGS', 1);
define('OPENING_TAGS', 2);
define('CLOSING_TAGS', 3);
define('SELFCLOSING_TAGS', 4);
define('OPENING_AND_SELFCLOSING_TAGS', 5);
define('SELFCLOSING_AND_OPENING_TAGS', 5);
define('CLOSING_AND_SELFCLOSING_TAGS', 6);
define('SELFCLOSING_AND_CLOSING_TAGS', 6);
define('OPENING_AND_CLOSING_TAGS', 7);
define('CLOSING_AND_OPENING_TAGS', 7);

// Token selectors: single types.
define('ALL_TOKENS', 1);
define('TEXT_TOKENS', 2);
define('TAG_TOKENS', 3);
define('COMMENT_TOKENS', 4);
define('CDATA_TOKENS', 5);

// Token selectors: pairs (both word orders are provided).
define('TEXT_AND_TAG_TOKENS', 6);
define('TAG_AND_TEXT_TOKENS', 6);
define('TEXT_AND_COMMENT_TOKENS', 7);
define('COMMENT_AND_TEXT_TOKENS', 7);
define('TEXT_AND_CDATA_TOKENS', 8);
define('CDATA_AND_TEXT_TOKENS', 8);
define('TAG_AND_COMMENT_TOKENS', 9);
define('COMMENT_AND_TAG_TOKENS', 9);
define('TAG_AND_CDATA_TOKENS', 10);
define('CDATA_AND_TAG_TOKENS', 10);
define('COMMENT_AND_CDATA_TOKENS', 11);
define('CDATA_AND_COMMENT_TOKENS', 11);

// Token selectors: triples (every word order is provided).
define('TEXT_TAG_AND_COMMENT_TOKENS', 12);
define('TEXT_COMMENT_AND_TAG_TOKENS', 12);
define('TAG_TEXT_AND_COMMENT_TOKENS', 12);
define('TAG_COMMENT_AND_TEXT_TOKENS', 12);
define('COMMENT_TAG_AND_TEXT_TOKENS', 12);
define('COMMENT_TEXT_AND_TAG_TOKENS', 12);
define('TEXT_TAG_AND_CDATA_TOKENS', 13);
define('TEXT_CDATA_AND_TAG_TOKENS', 13);
define('TAG_TEXT_AND_CDATA_TOKENS', 13);
define('TAG_CDATA_AND_TEXT_TOKENS', 13);
define('CDATA_TAG_AND_TEXT_TOKENS', 13);
define('CDATA_TEXT_AND_TAG_TOKENS', 13);
define('TEXT_COMMENT_AND_CDATA_TOKENS', 14);
define('TEXT_CDATA_AND_COMMENT_TOKENS', 14);
define('COMMENT_TEXT_AND_CDATA_TOKENS', 14);
define('COMMENT_CDATA_AND_TEXT_TOKENS', 14);
define('CDATA_COMMENT_AND_TEXT_TOKENS', 14);
define('CDATA_TEXT_AND_COMMENT_TOKENS', 14);
define('TAG_COMMENT_AND_CDATA_TOKENS', 15);
define('TAG_CDATA_AND_COMMENT_TOKENS', 15);
define('COMMENT_TAG_AND_CDATA_TOKENS', 15);
define('COMMENT_CDATA_AND_TAG_TOKENS', 15);
define('CDATA_COMMENT_AND_TAG_TOKENS', 15);
define('CDATA_TAG_AND_COMMENT_TOKENS', 15);
#########################################################################################################
#########################################################################################################
##
## parseHTML assumes valid XHTML:
## -every tag must be closed
## -every attribute must have a value
## -tag names and attributes are all lowercase
##
#########################################################################################################
#########################################################################################################
class parseHTML {
var $blockTags = array("address", "article", "aside", "blockquote", "center", "dd", "dialog", "dir", "div", "dl", "dt", "fieldset", "figure", "footer", "form", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "isindex", "li", "menu", "nav", "noframes", "noscript", "ol", "p", "pre", "section", "table", "tbody", "td", "tfoot", "th", "thead", "tr", "ul");
var $html = array();
/*
$html is an ARRAY with the following structure:
index => ARRAY: tokenized XHTML
"type" => STRING: REQUIRED; "comment" | "dtd" | "cdata" | "xml" | "tag" | "text"
"value" => STRING: REQUIRED; token content
"name" => STRING: REQUIRED for type "tag"; element name
"openPos" => INTEGER: REQUIRED for closing tags (including self-closing); integer corresponding to the index of the opening tag
// if a closing tag is missing an opening match, it will be treated as self-closing
"closePos" => INTEGER: REQUIRED for opening and self-closing tags; integer corresponding to the index of the closing tag
// if an opening tag is missing a closing match, it will be treated as closed by its parent's closing tag (or end of string)
"attribute" => ARRAY: REQUIRED if "tag" has assigned attributes; attribute_names => values
"parents" => ARRAY: REQUIRED if "tag" has parent tag(s); parent tags: "index" => array("tagName" => tagName, "attributes" => array(name => value, ... ))
"locked" => BOOLEAN: OPTIONAL; TRUE by default for all types. It is never set to FALSE, it is just unset.
"ERROR" => STRING: error message (i.e. improperly nested tag...)
"prevChr" => CHARACTER: REQUIRED for type "text" if previous character exists; last character of previous "text" if separated by inline tags or HTML comments
"nextChr" => CHARACTER: REQUIRED for type "text" if next character exists; first character of next "text" if only separated by inline tags or HTML comments
*/
#=======================================================================
#=======================================================================
#== METHODS
#=======================================================================
#=======================================================================
########################################################################
# ( UN | RE )LOAD, UPDATE AND CLEAR METHODS
#
#
# Params: STRING containing HTML markup.
# Action: Tokenized $rawHTML saved to $this->html
# Returns: TRUE on completion
function load($rawHTML) {
    // Tokenizes $rawHTML into comment / dtd / cdata / xml / tag / text tokens and stores
    // them in $this->html. Every token starts out locked. Opening tags receive "closePos",
    // closing tags receive "openPos", and self-closing tags (as well as closing tags that
    // have no opening match) receive both, pointing at themselves.
    $this->clear();
    $tokens = array();
    $index = 0;
    $nestedTags = array(); // $index => array("tagName" => ..., "attributes" => ...) for every tag that is still open

    // lower-cased block tag names as array keys, so the block-level test is a single isset()
    $blockTagLookup = array_flip(array_map('strtolower', $this->blockTags));

    # find HTML comments
    $commentTag = '(?:<!(?:--.*?--\s*)+>)'; // required modifier: s (DotAll)
    # find Document Type Definition
    $dtdTag = '(?:<![-a-zA-Z0-9:]+\b(?:.*?(?:--.*?--\s*)?)*>)'; // required modifier: s (DotAll)
    # find (Unparsed) Character Data
    # a CDATA section opens with "<![CDATA["; the "!" is optional here so the previously accepted "<[CDATA[" form still matches
    $cdataTag = '(?:<!?\[CDATA\[.*?\]\]>)'; // required modifier: s (DotAll)
    # find XML Declaration
    $xmlTag = '(?:<\?xml\s.*?\?>)'; // required modifier: s (DotAll)
    # find XHTML Tags
    $htmlTag = '(?:</?[-a-zA-Z0-9:]+\b(?>[^"\'>]+|"[^"]*"|\'[^\']*\')*>)'; // required modifier: s (DotAll)
    # find XHTML Tags with ability to grab tag name and test for closing tags
    $htmlTagDetail = '
< # open of HTML element
(/)? # Subpattern 1: test for closing tag
([-a-zA-Z0-9:]+) # Subpattern 2: tag name
(?:
[^\'">]+ # matches any attribute names
|
"[^"]*" # double quoted attribute value
|
\'[^\']*\' # single quoted attribute value
)*
((?<=/)>)? # Subpattern 3: test for self-closing tag
'; //required modifiers: x
    # find attribute/value pairs in HTML tags
    $attributePattern= '
\s+ # one or more spaces
([-a-zA-Z0-9:]+) # Subpattern 1: attributeibute name
\s*=\s*
(?:
"([^"]+)" # Subpattern 2: possibly attribute value
|
\'([^\']+)\' # Subpattern 3: possibly attribute value
)
'; //required modifiers: x
    # find any tag
    $anyTag = "$commentTag|$dtdTag|$cdataTag|$xmlTag|$htmlTag"; // required modifier: s (DotAll)
    $parts = preg_split("@($anyTag)@s", $rawHTML, -1, PREG_SPLIT_DELIM_CAPTURE);

    // token types that carry no structure of their own, tested in this order
    $simplePatterns = array(
        'comment' => $commentTag,
        'dtd' => $dtdTag,
        'cdata' => $cdataTag,
        'xml' => $xmlTag,
    );

    // we will use "prevChr" and "nextChr" to give context to type "text"
    // "prevChr" is not relevant to the first child of type "text" in a block level HTML element
    // "nextChr" is not relevant to the last child of type "text" in a block level HTML element
    // we will use $prevTextIndex to help us properly assign "prevChr" and "nextChr"
    $prevTextIndex = NULL;

    foreach ($parts as $part) {
        if ($part != "") {
            $simpleType = NULL;
            foreach ($simplePatterns as $candidateType => $candidatePattern) {
                if (preg_match("@\A$candidatePattern\Z@s", $part)) {
                    $simpleType = $candidateType;
                    break;
                }
            }

            if ($simpleType !== NULL) {
                $tokens[$index] = array(
                    "type" => $simpleType,
                    "value" => $part,
                    "locked" => TRUE,
                );
                // remember parents
                if (!empty($nestedTags))
                    $tokens[$index]["parents"] = $nestedTags;
            } elseif (preg_match("@\A$htmlTagDetail@x", $part, $tagMatch)) {
                $tagName = $tagMatch[2];
                $selfClose = (isset($tagMatch[3]) && ($tagMatch[3])) ? TRUE : FALSE;
                $closing = ($tagMatch[1] || $selfClose) ? TRUE : FALSE;
                $tokens[$index] = array(
                    "type" => 'tag',
                    "value" => $part,
                    "name" => $tagName,
                    "locked" => TRUE,
                );
                // if tag was block, reset character context for type "text"
                if (isset($blockTagLookup[strtolower($tagName)]))
                    $prevTextIndex = NULL;

                if (!$closing) {
                    // remember parents
                    if (!empty($nestedTags))
                        $tokens[$index]["parents"] = $nestedTags;
                    $attribute = array();
                    if (preg_match_all("@$attributePattern@x", $part, $attributeMatch)) {
                        foreach ($attributeMatch[1] as $key => $attributeName) {
                            // one of the two captures is empty, the other holds the value
                            $attribute[$attributeName] = $attributeMatch[2][$key].$attributeMatch[3][$key];
                        }
                    }
                    if (!empty($attribute))
                        $tokens[$index]["attribute"] = $attribute;
                    // add to $nestedTags
                    $nestedTags[$index]["tagName"] = $tagName;
                    $nestedTags[$index]["attributes"] = isset($tokens[$index]["attribute"]) ? $tokens[$index]["attribute"] : NULL;
                } elseif ($selfClose) {
                    // remember parents
                    if (!empty($nestedTags))
                        $tokens[$index]["parents"] = $nestedTags;
                    // set openPos and closePos to this index
                    $tokens[$index]["openPos"] = $index;
                    $tokens[$index]["closePos"] = $index;
                } else {
                    // Look for the nearest open tag with this name BEFORE touching anything,
                    // so an unmatched closing tag cannot leave bogus closePos / ERROR marks
                    // on the tags that are legitimately still open.
                    $matchIndex = NULL;
                    foreach (array_reverse(array_keys($nestedTags)) as $openIndex) {
                        if ($nestedTags[$openIndex]["tagName"] === $tagName) {
                            $matchIndex = $openIndex;
                            break;
                        }
                    }

                    if ($matchIndex === NULL) {
                        // treat unmatched closing tag as self closing and flag the closing tag itself
                        $tokens[$index]["openPos"] = $index;
                        $tokens[$index]["closePos"] = $index;
                        $tokens[$index]["ERROR"] = "MISSING OR IMPROPERLY NESTED OPENING TAG";
                    } else {
                        // unwind the stack down to (and including) the matching start tag
                        while (count($nestedTags) > 0) {
                            end($nestedTags);
                            $lastTagIndex = key($nestedTags);
                            unset($nestedTags[$lastTagIndex]);

                            if ($lastTagIndex === $matchIndex) {
                                // we have a matching start tag
                                $tokens[$index]["openPos"] = $lastTagIndex;
                                $tokens[$lastTagIndex]["closePos"] = $index;
                                break;
                            }

                            // we have an improperly nested opening tag, close it at its parent's closing tag
                            $tokens[$lastTagIndex]["closePos"] = $index;
                            $tokens[$lastTagIndex]["ERROR"] = "MISSING OR IMPROPERLY NESTED CLOSING TAG";
                            // if improperly nested tag was block, reset character context for type "text"
                            if (isset($blockTagLookup[strtolower($tokens[$lastTagIndex]["name"])]))
                                $prevTextIndex = NULL;
                        }
                    }
                }
            } else {
                $tokens[$index] = array(
                    "type" => 'text',
                    "value" => $part,
                    "locked" => TRUE,
                );
                // remember parents
                if (!empty($nestedTags))
                    $tokens[$index]["parents"] = $nestedTags;
                // remember character context; strict test because index 0 is a valid previous text token
                if ($prevTextIndex !== NULL) {
                    // assign "prevChr"
                    $tokens[$index]["prevChr"] = mb_substr($tokens[$prevTextIndex]["value"], -1, 1, "UTF-8");
                    // set "nextChr" of previous text token
                    $tokens[$prevTextIndex]["nextChr"] = mb_substr($tokens[$index]["value"], 0, 1, "UTF-8");
                }
                // set $prevTextIndex for next text item
                $prevTextIndex = $index;
            }
            $index++;
        }
    }

    // look for opening tags that never got closed, close at end of file
    foreach ($nestedTags as $openIndex => $openTag) {
        $tokens[$openIndex]["closePos"] = $index;
        $tokens[$openIndex]["ERROR"] = "MISSING CLOSING TAG";
    }

    $this->html = $tokens;
    return TRUE;
}
# Action: reloads $html (i.e. capture new tags inserted in text, or remove those whose values are deleted)
# Returns: TRUE on completion
# WARNING: Tokens acquired through "get" methods may not match new tokenization
function reload() {
    // unload() serializes the current tokens (and clears them); load() re-tokenizes that markup
    $markup = $this->unload();
    return $this->load($markup);
}
# Action: outputs HTML as string
# Returns: STRING of HTML markup
function unload() {
    // collect every token's "value" in order, then join them into one string
    $pieces = array();
    foreach ($this->html as $token) {
        $pieces[] = $token["value"];
    }
    // the token list is emptied as part of unloading
    $this->clear();
    return implode("", $pieces);
}
# Params: ARRAY of tokens.
# Action: overwrite "value" for all unlocked matching tokens
# Returns: TRUE on completion
function update($tokens) {
    // Copies "value" from each supplied token onto the stored token at the same index,
    // but only when that stored token exists and is not locked.
    foreach ($tokens as $index => $token) {
        // skip indexes that are not in $this->html: writing to them would create a
        // token with no "type", which the type-based methods cannot handle
        if (!isset($this->html[$index]) || !isset($token["value"]))
            continue;
        // "locked" is removed (not set to FALSE) on unlock, so empty() covers both cases
        if (empty($this->html[$index]["locked"]))
            $this->html[$index]["value"] = $token["value"];
    }
    return TRUE;
}
# Action: resets $this->html to an empty array
# Returns: TRUE on completion
function clear() {
    // drop every stored token by replacing the list with a fresh empty array
    $emptyTokenList = array();
    $this->html = $emptyTokenList;
    return TRUE;
}
########################################################################
# LOCK / UNLOCK METHODS
# Action: lock matching tokens
# Returns: TRUE on completion
# Params: ARRAY of tokens.
function lock($tokens) {
    // only the keys of $tokens matter: they identify positions in $this->html
    foreach (array_keys($tokens) as $position) {
        if (!isset($this->html[$position]))
            continue;
        $this->html[$position]["locked"] = TRUE;
    }
    return TRUE;
}
# Params: ARRAY of tokens.
# Action: unlock matching tokens (the "locked" flag is removed, not set to FALSE)
# Returns: TRUE on completion
function unlock($tokens) {
    // only the keys of $tokens matter: they identify positions in $this->html
    foreach (array_keys($tokens) as $position) {
        if (!isset($this->html[$position]["locked"]))
            continue;
        unset($this->html[$position]["locked"]);
    }
    return TRUE;
}
# Action: locks every HTML comment token
# Returns: TRUE on completion
function lock_comments() {
    // load() stores comments with type "comment" (singular); "comments" never matched anything
    return $this->lock_type("comment");
}
# Action: unlocks every HTML comment token
# Returns: TRUE on completion
function unlock_comments() {
    // load() stores comments with type "comment" (singular); "comments" never matched anything
    return $this->unlock_type("comment");
}
# Action: locks every token of type "dtd"
# Returns: TRUE on completion
function lock_dtd() {
    $done = $this->lock_type("dtd");
    return $done;
}
# Action: unlocks every token of type "dtd"
# Returns: TRUE on completion
function unlock_dtd() {
    $done = $this->unlock_type("dtd");
    return $done;
}
# Action: locks every token of type "cdata"
# Returns: TRUE on completion
function lock_cdata() {
    $done = $this->lock_type("cdata");
    return $done;
}
# Action: unlocks every token of type "cdata"
# Returns: TRUE on completion
function unlock_cdata() {
    $done = $this->unlock_type("cdata");
    return $done;
}
# Action: locks every XML declaration token
# Returns: TRUE on completion
function lock_xml() {
    // XML declarations are stored with type "xml"; this used to lock "tag" tokens,
    // which is what lock_tags() is for
    return $this->lock_type("xml");
}
# Action: unlocks every XML declaration token
# Returns: TRUE on completion
function unlock_xml() {
    // XML declarations are stored with type "xml"; this used to unlock "tag" tokens,
    // which is what unlock_tags() is for
    return $this->unlock_type("xml");
}
# Params: $tagType INT equal to OPENING_TAGS, CLOSING_TAGS, SELFCLOSING_TAGS, OPENING_AND_SELFCLOSING_TAGS, SELFCLOSING_AND_CLOSING_TAGS, OPENING_AND_CLOSING_TAGS, ALL_TAGS
function lock_tags($tagType = ALL_TAGS) {
    // get_tags() performs the openPos/closePos selection for $tagType
    // (all tags when $tagType is ALL_TAGS or unrecognised); lock whatever it returns
    $selectedTags = $this->get_tags($tagType);
    return $this->lock($selectedTags);
}
# Params: $tagType INT equal to OPENING_TAGS, CLOSING_TAGS, SELFCLOSING_TAGS, OPENING_AND_SELFCLOSING_TAGS, SELFCLOSING_AND_CLOSING_TAGS, OPENING_AND_CLOSING_TAGS, ALL_TAGS
function unlock_tags($tagType = ALL_TAGS) {
    // get_tags() performs the openPos/closePos selection for $tagType
    // (all tags when $tagType is ALL_TAGS or unrecognised); unlock whatever it returns
    $selectedTags = $this->get_tags($tagType);
    return $this->unlock($selectedTags);
}
# Action: locks every token of type "text"
# Returns: TRUE on completion
function lock_text() {
    $done = $this->lock_type("text");
    return $done;
}
# Action: unlocks every token of type "text"
# Returns: TRUE on completion
function unlock_text() {
    $done = $this->unlock_type("text");
    return $done;
}
# Params: ARRAY of tokens; $tokenType INT selecting which child token types to affect
# Action: locks the tokens located between each opening tag and its closing position
# Returns: TRUE on completion
function lock_children($tokens, $tokenType = ALL_TOKENS) {
    foreach ($tokens as $position => $candidate) {
        // only process opening tags: a usable "closePos" and no usable "openPos"
        if (!empty($candidate["openPos"]) || empty($candidate["closePos"]))
            continue;
        $firstChild = $position + 1;
        $lastChild = $candidate["closePos"] - 1;
        // nothing sits between the opening tag and its closing position
        if ($firstChild > $lastChild)
            continue;
        $this->lock($this->get_sequential_tokens($firstChild, $lastChild, $tokenType));
    }
    return TRUE;
}
# Params: ARRAY of tokens; $tokenType INT selecting which child token types to affect
# Action: unlocks the tokens located between each opening tag and its closing position
# Returns: TRUE on completion
function unlock_children($tokens, $tokenType = ALL_TOKENS) {
    foreach ($tokens as $position => $candidate) {
        // only process opening tags: a usable "closePos" and no usable "openPos"
        if (!empty($candidate["openPos"]) || empty($candidate["closePos"]))
            continue;
        $firstChild = $position + 1;
        $lastChild = $candidate["closePos"] - 1;
        // nothing sits between the opening tag and its closing position
        if ($firstChild > $lastChild)
            continue;
        $this->unlock($this->get_sequential_tokens($firstChild, $lastChild, $tokenType));
    }
    return TRUE;
}
########################################################################
# GET METHODS
# Returns: ARRAY of matching tokens
function get_all() {
    // the complete token list, locked and unlocked alike
    $allTokens = $this->html;
    return $allTokens;
}
# Returns: ARRAY of all locked tokens, keyed by their index in $this->html
function get_locked() {
    $tokens = array();
    foreach ($this->html as $index => $token) {
        // unlocking removes the "locked" key, so it must not be read directly
        if (!empty($token["locked"]))
            $tokens[$index] = $token;
    }
    return $tokens;
}
# Returns: ARRAY of all unlocked tokens, keyed by their index in $this->html
function get_unlocked() {
    $tokens = array();
    foreach ($this->html as $index => $token) {
        // unlocked tokens have no "locked" key at all, so it must not be read directly
        if (empty($token["locked"]))
            $tokens[$index] = $token;
    }
    return $tokens;
}
# Returns: ARRAY of all HTML comment tokens
function get_comments() {
    // load() stores comments with type "comment" (singular); "comments" never matched anything
    return $this->get_type("comment");
}
# Returns: ARRAY of locked HTML comment tokens
function get_locked_comments() {
    // load() stores comments with type "comment" (singular); "comments" never matched anything
    return $this->get_locked_type("comment");
}
# Returns: ARRAY of unlocked HTML comment tokens
function get_unlocked_comments() {
    // load() stores comments with type "comment" (singular); "comments" never matched anything
    return $this->get_unlocked_type("comment");
}
# Returns: ARRAY of all tokens of type "dtd"
function get_dtd() {
    $matches = $this->get_type("dtd");
    return $matches;
}
# Returns: ARRAY of locked tokens of type "dtd"
function get_locked_dtd() {
    $matches = $this->get_locked_type("dtd");
    return $matches;
}
# Returns: ARRAY of unlocked tokens of type "dtd"
function get_unlocked_dtd() {
    $matches = $this->get_unlocked_type("dtd");
    return $matches;
}
# Returns: ARRAY of all tokens of type "cdata"
function get_cdata() {
    $matches = $this->get_type("cdata");
    return $matches;
}
# Returns: ARRAY of locked tokens of type "cdata"
function get_locked_cdata() {
    $matches = $this->get_locked_type("cdata");
    return $matches;
}
# Returns: ARRAY of unlocked tokens of type "cdata"
function get_unlocked_cdata() {
    $matches = $this->get_unlocked_type("cdata");
    return $matches;
}
# Returns: ARRAY of all XML declaration tokens
function get_xml() {
    // XML declarations are stored with type "xml"; this used to return "tag" tokens,
    // which is what get_tags() is for
    return $this->get_type("xml");
}
# Returns: ARRAY of locked XML declaration tokens
function get_locked_xml() {
    // XML declarations are stored with type "xml"; this used to return "tag" tokens,
    // which is what get_locked_tags() is for
    return $this->get_locked_type("xml");
}
# Returns: ARRAY of unlocked XML declaration tokens
function get_unlocked_xml() {
    // XML declarations are stored with type "xml"; this used to return "tag" tokens,
    // which is what get_unlocked_tags() is for
    return $this->get_unlocked_type("xml");
}
# Params: $tagType INT equal to OPENING_TAGS, CLOSING_TAGS, SELFCLOSING_TAGS, OPENING_AND_SELFCLOSING_TAGS, SELFCLOSING_AND_CLOSING_TAGS, OPENING_AND_CLOSING_TAGS, ALL_TAGS
function get_tags($tagType = ALL_TAGS) {
    // Classification relies on which position keys a tag token carries:
    //   opening tag      -> "closePos" only
    //   closing tag      -> "openPos" only
    //   self-closing tag -> both
    $selected = array();
    foreach ($this->get_type("tag") as $index => $tag) {
        $hasOpenPos = isset($tag["openPos"]);
        $hasClosePos = isset($tag["closePos"]);
        switch ($tagType) {
            case OPENING_TAGS:
                $keep = !$hasOpenPos && $hasClosePos;
                break;
            case CLOSING_TAGS:
                $keep = $hasOpenPos && !$hasClosePos;
                break;
            case SELFCLOSING_TAGS:
                $keep = $hasOpenPos && $hasClosePos;
                break;
            case OPENING_AND_SELFCLOSING_TAGS:
                $keep = $hasClosePos;
                break;
            case SELFCLOSING_AND_CLOSING_TAGS:
                $keep = $hasOpenPos;
                break;
            case OPENING_AND_CLOSING_TAGS:
                // exactly one of the two position keys is present
                $keep = ($hasOpenPos != $hasClosePos);
                break;
            default:
                // ALL_TAGS, or any unrecognised selector, keeps every tag
                $keep = TRUE;
        }
        if ($keep)
            $selected[$index] = $tag;
    }
    return $selected;
}
# Params: $tagType INT equal to OPENING_TAGS, CLOSING_TAGS, SELFCLOSING_TAGS, OPENING_AND_SELFCLOSING_TAGS, SELFCLOSING_AND_CLOSING_TAGS, OPENING_AND_CLOSING_TAGS, ALL_TAGS
function get_locked_tags($tagType = ALL_TAGS) {
    // tags selected by $tagType, restricted to the indexes that are also locked tags;
    // array_intersect_key() keeps the keys and order of its first argument
    $lockedTags = $this->get_locked_type("tag");
    $tagsOfType = $this->get_tags($tagType);
    return array_intersect_key($tagsOfType, $lockedTags);
}
# Params: $tagType INT equal to OPENING_TAGS, CLOSING_TAGS, SELFCLOSING_TAGS, OPENING_AND_SELFCLOSING_TAGS, SELFCLOSING_AND_CLOSING_TAGS, OPENING_AND_CLOSING_TAGS, ALL_TAGS
function get_unlocked_tags($tagType = ALL_TAGS) {
    // tags selected by $tagType, restricted to the indexes that are also unlocked tags;
    // array_intersect_key() keeps the keys and order of its first argument
    $unlockedTags = $this->get_unlocked_type("tag");
    $tagsOfType = $this->get_tags($tagType);
    return array_intersect_key($tagsOfType, $unlockedTags);
}
# Returns: ARRAY of all tokens of type "text"
function get_text() {
    $matches = $this->get_type("text");
    return $matches;
}
# Returns: ARRAY of locked tokens of type "text"
function get_locked_text() {
    $matches = $this->get_locked_type("text");
    return $matches;
}
# Returns: ARRAY of unlocked tokens of type "text"
function get_unlocked_text() {
    $matches = $this->get_unlocked_type("text");
    return $matches;
}
# Params: $tagNames STRING tag name or ARRAY of tag names
# $tagType INT equal to OPENING_TAGS, CLOSING_TAGS, SELFCLOSING_TAGS, OPENING_AND_SELFCLOSING_TAGS, SELFCLOSING_AND_CLOSING_TAGS, OPENING_AND_CLOSING_TAGS, ALL_TAGS
function get_tags_by_name($tagNames, $tagType = ALL_TAGS) {
    // a single name is handled as a one-element list
    if (is_string($tagNames)) {
        $tagNames = array($tagNames);
    }
    $found = array();
    foreach ($this->get_tags($tagType) as $position => $candidate) {
        foreach ($tagNames as $wantedName) {
            // requested names are lower-cased; the stored tag name is compared as-is
            if ($candidate["name"] == strtolower($wantedName)) {
                $found[$position] = $candidate;
                break; // already collected, further names cannot change the result
            }
        }
    }
    return $found;
}
# Params: $idNames STRING id name or ARRAY of id names
function get_tag_by_id($idNames) {
    // an id lookup is an attribute lookup on "id" over opening and self-closing tags
    $matches = $this->get_tags_by_attribute('id', $idNames, OPENING_AND_SELFCLOSING_TAGS);
    return $matches;
}
# Params: $classNames STRING class name or ARRAY of class names
# $tagType INT equal to OPENING_TAGS, SELFCLOSING_TAGS, OPENING_AND_SELFCLOSING_TAGS
function get_tags_by_class($classNames, $tagType = OPENING_AND_SELFCLOSING_TAGS) {
    // a class lookup is an attribute lookup on "class"
    $matches = $this->get_tags_by_attribute('class', $classNames, $tagType);
    return $matches;
}
# Params: $attribute STRING attribute type
# $attributeValues STRING attribute value or ARRAY of attribute values
# $tagType INT equal to OPENING_TAGS, SELFCLOSING_TAGS, OPENING_AND_SELFCLOSING_TAGS
function get_tags_by_attribute($attribute, $attributeValues, $tagType = OPENING_TAGS) {
    // Returns the tags (selected by $tagType) whose $attribute equals any of $attributeValues.
    //   "id":    at most one tag per requested value (the first one found)
    //   "class": a tag matches when any of its whitespace-separated classes matches
    //   other:   the whole attribute value must match
    if (is_string($attributeValues)) $attributeValues = array($attributeValues);
    $tags = $this->get_tags($tagType);
    $tagsByAttribute = array();
    $attributeKey = strtolower($attribute);

    if ($attributeKey == "id") {
        foreach ($attributeValues as $attributeValue) {
            foreach ($tags as $index => $tag) {
                // tags without an id are skipped: reading the missing key raised a notice
                // and made a missing id loosely equal to ""
                if (isset($tag["attribute"]["id"]) && $tag["attribute"]["id"] == $attributeValue) {
                    $tagsByAttribute[$index] = $tag;
                    break;
                }
            }
        }
    } elseif ($attributeKey == "class") {
        foreach ($tags as $index => $tag) {
            if (!isset($tag["attribute"]["class"]))
                continue;
            // because there may be multiple classes
            $classList = preg_split('#\s+#', $tag["attribute"]["class"], -1, PREG_SPLIT_NO_EMPTY);
            foreach ($classList as $className) {
                foreach ($attributeValues as $attributeValue) {
                    if ($className == $attributeValue) {
                        $tagsByAttribute[$index] = $tag;
                    }
                }
            }
        }
    } else {
        foreach ($tags as $index => $tag) {
            // test the current $tag; the previous test on $tags (the whole list) was never true
            if (!isset($tag["attribute"][$attribute]))
                continue;
            foreach ($attributeValues as $attributeValue) {
                if ($tag["attribute"][$attribute] == $attributeValue)
                    $tagsByAttribute[$index] = $tag;
            }
        }
    }
    return $tagsByAttribute;
}
# Params: ARRAY of tokens
function get_children($tokens, $tokenType = ALL_TOKENS) {
    $children = array();
    foreach ($tokens as $position => $candidate) {
        // exclude (self)closing tags: keep only tokens with a usable "closePos" and no usable "openPos"
        if (empty($candidate["closePos"]) || !empty($candidate["openPos"]))
            continue;
        $firstChild = $position + 1;
        $lastChild = $candidate["closePos"] - 1;
        // nothing sits between the opening tag and its closing position
        if ($firstChild > $lastChild)
            continue;
        // array union keeps the first occurrence of every index, so nested parents add no duplicates
        $children += $this->get_sequential_tokens($firstChild, $lastChild, $tokenType);
    }
    return $children;
}
########################################################################
# CONDITIONAL METHODS
#
# Returns: TRUE or FALSE depending if condition is met
# Parameter: $tagNames MIXED value(s) of tag name, such as STRING of tag name or ARRAY of tag names
# $token ARRAY token to be evaluated
function in_tag($tagNames, $token) {
    // a single name is handled as a one-element list
    if (is_string($tagNames)) {
        $tagNames = array($tagNames);
    }
    // a token without recorded parents is inside nothing
    if (!isset($token["parents"])) {
        return FALSE;
    }
    foreach ($token["parents"] as $ancestor) {
        if (!isset($ancestor["tagName"])) {
            continue;
        }
        foreach ($tagNames as $wantedName) {
            if ($ancestor["tagName"] == $wantedName) {
                return TRUE;
            }
        }
    }
    return FALSE;
}
# Parameters: $attributeName STRING name of attribute, such as "id" or "class"
# $attributeValue MIXED value(s) of attribute, such as STRING of id Name or ARRAY of Class Names
# note: if an ARRAY is passed, method will return TRUE if _any_ of the values match
# $token ARRAY token to be evaluated
function in_attribute($attributeName, $attributeValues, $token) {
    // a single value is handled as a one-element list
    if (is_string($attributeValues)) {
        $attributeValues = array($attributeValues);
    }
    // a token without recorded parents is inside nothing
    if (!isset($token["parents"])) {
        return FALSE;
    }
    foreach ($token["parents"] as $ancestor) {
        if (!isset($ancestor["attributes"][$attributeName])) {
            continue;
        }
        $rawValue = $ancestor["attributes"][$attributeName];
        if ($attributeName == "class" || $attributeName == "CLASS") {
            // because there may be multiple classes, each one is a candidate
            $candidates = preg_split('#\s+#', $rawValue, -1, PREG_SPLIT_NO_EMPTY);
        } else {
            // any other attribute is compared as a whole
            $candidates = array($rawValue);
        }
        foreach ($candidates as $candidate) {
            foreach ($attributeValues as $wantedValue) {
                if ($candidate == $wantedValue) {
                    return TRUE;
                }
            }
        }
    }
    return FALSE;
}
# Parameter: $idName MIXED - ARRAY or STRING of id Name(s)
# note: if an ARRAY is passed, method will return TRUE if _any_ of the values match
# $token ARRAY token to be evaluated
function in_id($idName, $token) {
    // an id test is an attribute test on "id"
    $inside = $this->in_attribute("id", $idName, $token);
    return $inside;
}
# Parameter: $className MIXED - ARRAY or STRING of class Name(s)
# note: if an ARRAY is passed, method will return TRUE if _any_ of the values match
# $token ARRAY token to be evaluated
function in_class($className, $token) {
    // a class test is an attribute test on "class"
    $inside = $this->in_attribute("class", $className, $token);
    return $inside;
}
#=======================================================================
#=======================================================================
#== MISC. METHODS
#=======================================================================
#=======================================================================
########################################################################
# LOCK / UNLOCK BY TYPE
# Action: locks / unlocks matching tokens
# Returns: TRUE on completion
# Params: STRING type to lock
function lock_type($type) {
	// walk the token list by key and flag every token of the requested type
	foreach(array_keys($this->html) as $position) {
		if($this->html[$position]["type"] == $type) {
			$this->html[$position]["locked"] = TRUE;
		}
	}
	return TRUE;
}
# Params: STRING type to lock
function unlock_type($type) {
	// unlocking removes the "locked" key entirely rather than setting it to FALSE
	foreach(array_keys($this->html) as $position) {
		if($this->html[$position]["type"] == $type) {
			unset($this->html[$position]["locked"]);
		}
	}
	return TRUE;
}
########################################################################
# GET METHODS
# Returns: returns matching tokens
#
# Params: STRING type to get
function get_type($type) {
	// collect matching tokens, keeping their original indexes as keys
	$matches = array();
	foreach(array_keys($this->html) as $position) {
		$candidate = $this->html[$position];
		if($candidate["type"] != $type) {
			continue;
		}
		$matches[$position] = $candidate;
	}
	return $matches;
}
# Params: STRING type to get
function get_unlocked_type($type) {
	// start from every token of the type and drop the ones whose "locked" flag is set and truthy
	$unlocked = $this->get_type($type);
	foreach($unlocked as $position => $candidate) {
		if(!empty($candidate["locked"])) {
			unset($unlocked[$position]);
		}
	}
	return $unlocked;
}
# Params: STRING type to get
# Returns: ARRAY of tokens of the given type whose "locked" flag is set and truthy,
#          keyed by their original index
function get_locked_type($type) {
	$tokens = array();
	foreach($this->get_type($type) as $index => $token) {
		// unlock_type() removes the "locked" key altogether, so it must not be read
		// unguarded: empty() treats a missing key as "not locked" without raising a notice
		if(!empty($token["locked"]))
			$tokens[$index] = $token;
	}
	return $tokens;
}
# Params: STRING beginning index
# STRING ending index
# Action: collects the tokens whose index lies between the two given indexes (inclusive,
#         in either order) and whose type is selected by $tokenType
# Returns: ARRAY of matching tokens keyed by their original index
function get_sequential_tokens($begIndex, $endIndex, $tokenType = ALL_TOKENS) {
	// ordered selector => token types table; the first loosely-equal selector wins,
	// anything unrecognised (including ALL_TOKENS) selects every token type
	$selectors = array(
		array(TEXT_TOKENS, array('text')),
		array(TAG_TOKENS, array('tag')),
		array(COMMENT_TOKENS, array('comment')),
		array(CDATA_TOKENS, array('cdata')),
		array(TEXT_AND_TAG_TOKENS, array('text','tag')),
		array(TEXT_AND_COMMENT_TOKENS, array('text','comment')),
		array(TEXT_AND_CDATA_TOKENS, array('text','cdata')),
		array(TAG_AND_COMMENT_TOKENS, array('tag','comment')),
		array(TAG_AND_CDATA_TOKENS, array('tag','cdata')),
		array(COMMENT_AND_CDATA_TOKENS, array('comment','cdata')),
		array(TEXT_TAG_AND_COMMENT_TOKENS, array('text','tag','comment')),
		array(TEXT_TAG_AND_CDATA_TOKENS, array('text','tag','cdata')),
		array(TEXT_COMMENT_AND_CDATA_TOKENS, array('text','comment','cdata')),
		array(TAG_COMMENT_AND_CDATA_TOKENS, array('tag','comment','cdata')),
	);
	$wanted = array('text','tag','comment','cdata');
	foreach($selectors as $selector) {
		if($tokenType == $selector[0]) {
			$wanted = $selector[1];
			break;
		}
	}

	// normalise the range so it always runs from the lower to the higher index
	$first = $begIndex;
	$last = $endIndex;
	if($first > $last) {
		$first = $endIndex;
		$last = $begIndex;
	}

	$found = array();
	for($position = $first; $position <= $last; $position++) {
		if(!isset($this->html[$position])) {
			continue;
		}
		if(in_array($this->html[$position]["type"], $wanted)) {
			$found[$position] = $this->html[$position];
		}
	}
	return $found;
}
} // end class parseHTML

View file

@ -0,0 +1,511 @@
<?php
/*
Project Name: PHP Parser
URI: http://kingdesk.com/projects/php-parser/
Author: Jeffrey D. King
Author URI: http://kingdesk.com/about/jeff/
Copyright 2009, KINGdesk, LLC. Licensed under the GNU General Public License 2.0. If you use, modify and/or redistribute this software, you must leave the KINGdesk, LLC copyright information, the request for a link to http://kingdesk.com, and the web design services contact information unchanged. If you redistribute this software, or any derivative, it must be released under the GNU General Public License 2.0. This program is distributed without warranty (implied or otherwise) of suitability for any particular purpose. See the GNU General Public License for full license terms <http://creativecommons.org/licenses/GPL/2.0/>.
WE DON'T WANT YOUR MONEY: NO TIPS NECESSARY! If you enjoy this plugin, a link to http://kingdesk.com from your website would be appreciated.
For web design services, please contact info@kingdesk.com.
*/
#########################################################################################################
#########################################################################################################
##
## parseText assumes no HTML markup in text (except for special html characters like &gt;)
##
## if multibyte characters are passed, encoding must be UTF-8
##
#########################################################################################################
#########################################################################################################
class parseText {
	var $mb = FALSE; //changes to this must occur prior to load
	var $parsedHTML; // "" when a plain string was loaded, otherwise the parseHTML token passed to load()
	var $text = array();
	/*
	$text structure:
	ARRAY:
		index => ARRAY: tokenized Text
			// REQUIRED
			"type" => STRING: "space" | "punctuation" | "word" | "other"
			"value" => STRING: token content
			"parents" => ARRAY: parent tags: "index" => array("tagName" => tagName, "attributes" => array(name => value, ... ))
				// elements must be assigned this value if it has a parent HTML element
	*/

	#=======================================================================
	#=======================================================================
	#==	METHODS
	#=======================================================================
	#=======================================================================

	########################################################################
	# ( UN | RE )LOAD, UPDATE AND CLEAR METHODS
	#
	# Params:  $rawText STRING of text OR ARRAY containing a single parseHTML token
	# Action:  Tokenizes $rawText (or $rawText["value"] - as the case may be) and saves it to $this->text
	# Returns: TRUE on completion; FALSE if $rawText is neither STRING nor ARRAY, if its
	#          encoding is not detected as ASCII or UTF-8, or if the tokenizing regex fails
	function load($rawText) {
		$this->clear();

		if(is_string($rawText)) {
			// not passed a token of class parseHTML so we will fake it
			$this->parsedHTML = "";
		} elseif(is_array($rawText)) {
			// passed an instance of a parseHTML token
			$this->parsedHTML = $rawText;
			// a token without a "value" is treated as empty text instead of raising a notice
			$rawText = isset($rawText["value"]) ? (string) $rawText["value"] : "";
		} else {
			// we have an error
			return FALSE;
		}

		if(function_exists('mb_detect_encoding')) {
			$encodings = array("ASCII","UTF-8", "ISO-8859-1");
			$encoding = mb_detect_encoding($rawText."a", $encodings);
		} else {
			// mbstring is unavailable: only pure 7-bit ASCII can be accepted
			$encoding = preg_match('/\A[\x00-\x7F]*\z/', $rawText) ? "ASCII" : FALSE;
		}
		if("UTF-8" === $encoding) {
			$this->mb = TRUE;
		} elseif("ASCII" !== $encoding) {
			return FALSE;
		}

		$tokens = array();

		# find spacing FIRST (as it is the primary delimiter)

		# find the HTML character representation for the following characters:
		#		tab | line feed | carriage return | space | non-breaking space | ethiopic wordspace
		#		ogham space mark | en quad space | em quad space | en-space | three-per-em space
		#		four-per-em space | six-per-em space | figure space | punctuation space | em-space
		#		thin space | hair space | narrow no-break space
		#		medium mathematical space | ideographic space
		# Some characters are used inside words, we will not count these as a space for the purpose
		# of finding word boundaries:
		#		zero-width-space ("&#8203;", "&#x200b;")
		#		zero-width-non-joiner ("&#8204;", "&#x200c;", "&zwnj;")
		#		zero-width-joiner ("&#8205;", "&#x200d;", "&zwj;")
		$htmlSpaces = '
			(?:
				(?:										# alpha matches
					&
					(?: nbsp|ensp|emsp|thinsp )
					;
				)
				|
				(?:										# decimal matches
					&\#
					(?: 09|1[03]|32|160|4961|5760|819[2-9]|820[0-2]|8239|8287|12288 )
					;
				)
				|
				(?:										# hexidecimal matches
					&\#x
					(?: 000[9ad]|0020|00a0|1361|1680|200[0-9a]|202f|205f|3000 )
					;
				)
				|
				(?:										# actual characters
					\x{0009}|\x{000a}|\x{000d}|\x{0020}|\x{00a0}|\x{1361}|\x{2000}|\x{2001}|\x{2002}|\x{2003}|
					\x{2004}|\x{2005}|\x{2006}|\x{2007}|\x{2008}|\x{2009}|\x{200a}|\x{202f}|\x{205f}|\x{3000}
				)
			)
			'; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8)
		$space = "(?:\s|$htmlSpaces)+"; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8)

		# find punctuation and symbols before words (to capture preceding delimiting characters like hyphens or underscores)
		# see http://www.unicode.org/charts/PDF/U2000.pdf
		# see http://www.unicode.org/charts/PDF/U2E00.pdf
		# find punctuation and symbols
		#	dec matches = 33-44|46-47|58-60|62-64|91-94|96|123-126|161-172|174-191|215|247|710|732|977-978|982|8211-8231|8240-8286|8289-8292|8352-8399|8448-8527|8592-9215|9632-9983|11776-11903
		#	hex matches = 0021-002c|002e-002f|003a-003c|003e-0040|005b-e|0060|007b-007e|00a1-00ac|00ae-00bf|00d7|00f7|02c6|02dc|03d1-03d2|
		#				03d6|2013-2027|2030-205e|2061-2064|20a0-20cf|2100-214f|2190-23ff|25a0-26ff|2e00-2e7f
		#
		# Some characters are used inside words, we will not count these as a space for the purpose
		# of finding word boundaries:
		#		hyphens ("&#45;", "&#173;", "&#8208;", "&#8209;", "&#8210;", "&#x002d;", "&#x00ad;", "&#x2010;", "&#x2011;", "&#x2012;", "&shy;")
		#		underscore ("&#95;", "&#x005f;")
		$htmlPunctuation = '
			(?:
				(?:										# alpha matches
					&
					(?:quot|amp|frasl|lt|gt|iexcl|cent|pound|curren|yen|brvbar|sect|uml|pound|ordf|laquo|not|reg|macr|deg|plusmn|sup2|sup3|acute|micro|para|middot|cedil|sup1|ordm|raquo|frac14|frac12|frac34|iquest|times|divide|circ|tilde|thetasym|upsih|piv|ndash|mdash|lsquo|rsquo|sbquo|ldquo|rdquo|bdquo|dagger|Dagger|bull|hellip|permil|prime|Prime|lsaquo|rsaquo|oline|frasl|euro|trade|alefsym|larr|uarr|rarr|darr|harr|crarr|lArr|uArr|rArr|dArr|hArr|forall|part|exist|emptyn|abla|isin|notin|ni|prod|sum|minus|lowast|radic|prop|infin|ang|and|orc|ap|cup|int|there4|simc|ong|asymp|ne|equiv|le|ge|sub|supn|sub|sube|supe|oplus|otimes|perp|sdot|lceil|rceil|lfloor|rfloor|lang|rang|loz|spades|clubs|hearts|diams)
					;
				)
				|
				(?:										# decimal matches
					&\#
					(?: 3[3-9]|4[0-467]|5[89]|6[02-4]|9[1-46]|12[3-6]|16[1-9]|17[0-24-9]|18[0-9]|19[01]|215|247|710|732|97[78]|982|821[1-9]|822[0-9]|823[01]|82[4-7][0-9]|828[0-6]|8289|829[0-2]|835[2-9]|86[6-9][0-9]|844[89]|84[5-9][0-9]|851[0-9]|852[0-7]|859[2-9]|85[6-9][0-9]|8[6-9][0-9][0-9]|9[01][0-9][0-9]|920[0-9]|921[0-5]|963[2-9]|96[4-9][0-9]|9[78][0-9][0-9]|99[0-7][0-9]|998[0-3]|1177[6-9]|117[89][0-9]|118[0-9][0-9]|1190[0-3] )
					;
				)
				|
				(?:										# hexidecimal matches
					&\#x
					(?: 002[1-9a-cef]|003[a-cef]|0040|005[b-e]|0060|007[b-e]|00a[1-9a-cef]|00b[0-9a-f]|00d7|00f7|02c6|02dc|03d[126]|201[3-9a-f]|202[0-7]|20[34][0-9a-f]|205[0-9a-e]|206[1-4]|20[a-c][0-9a-f]|21[0-4][0-9a-f]|219[0-9a-f]|2[23][0-9a-f][0-9a-f]|25[a-f][0-9a-f]|23[0-9a-f][0-9a-f]|2e[0-7][0-9a-f] )
					;
				)
			)
			'; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8)
		$punctuation = "
			(?:
				(?:
					[^\w\s\&\/\@]	# assume characters that are not word spaces or whitespace are punctuation
									# exclude & as that is an illegal stand-alone character (and would interfere with HTML character representations
									# exclude slash \/as to not include the last slash in a URL
									# exclude @ as to keep twitter names together
					|
					$htmlPunctuation # catch any HTML reps of punctuation
				)+
			)
			";// required modifiers: x (multiline pattern) i (case insensitive) u (utf8)

		// letter connectors allowed in words (shared with get_words)
		$htmlLetterConnectors = $this->letter_connector_pattern();

		// word character html entities
		// character	0-9__	A-Z__	a-z___	other_special_chrs_____
		// decimal		48-57	65-90	97-122	192-214,216-246,248-255, 256-383
		// hex			30-39	41-5a	61-7a	c0-d6   d8-f6   f8-ff	0100-017f
		$htmlLetters = '
			(?:
				(?:										# alpha matches
					&
					(?:Agrave|Aacute|Acirc|Atilde|Auml|Aring|AElig|Ccedil|Egrave|Eacute|Ecirc|Euml|Igrave|Iacute|Icirc|Iuml|ETH|Ntilde|Ograve|Oacute|Ocirc|Otilde|Ouml|Oslash|Ugrave|Uacute|Ucirc|Uuml|Yacute|THORN|szlig|agrave|aacute|acirc|atilde|auml|aring|aelig|ccedil|egrave|eacute|ecirc|euml|igrave|iacute|icirc|iuml|eth|ntilde|ograve|oacute|ocirc|otilde|ouml|oslash|ugrave|uacute|ucirc|uuml|yacute|thorn|yuml)
					;
				)
				|
				(?:										# decimal matches (6[5-9]|[78][0-9]|90 covers A-Z, i.e. 65-90)
					&\#
					(?: 4[89]|5[0-7]|6[5-9]|[78][0-9]|90|9[7-9]|1[01][0-9]|12[0-2]|19[2-9]|20[0-9]|21[0-46-9]|2[23][0-9]|24[0-68-9]|2[5-9][0-9]|3[0-7][0-9]|38[0-3] )
					;
				)
				|
				(?:										# hexidecimal matches
					(?:
						&\#x00
						(?: 3[0-9]|4[1-9a-f]|5[0-9a]|6[1-9a-f]|7[0-9a]|c[0-9a-f]|d[0-689a-f]|e[0-9a-f]|f[0-689a-f] )
						;
					)
					|
					(?:
						&\#x01[0-7][0-9a-f];
					)
				)
				|
				(?:										# actual characters
					[0-9A-Za-z]|\x{00c0}|\x{00c1}|\x{00c2}|\x{00c3}|\x{00c4}|\x{00c5}|\x{00c6}|\x{00c7}|\x{00c8}|\x{00c9}|
					\x{00ca}|\x{00cb}|\x{00cc}|\x{00cd}|\x{00ce}|\x{00cf}|\x{00d0}|\x{00d1}|\x{00d2}|\x{00d3}|\x{00d4}|
					\x{00d5}|\x{00d6}|\x{00d8}|\x{00d9}|\x{00da}|\x{00db}|\x{00dc}|\x{00dd}|\x{00de}|\x{00df}|\x{00e0}|
					\x{00e1}|\x{00e2}|\x{00e3}|\x{00e4}|\x{00e5}|\x{00e6}|\x{00e7}|\x{00e8}|\x{00e9}|\x{00ea}|\x{00eb}|
					\x{00ec}|\x{00ed}|\x{00ee}|\x{00ef}|\x{00f0}|\x{00f1}|\x{00f2}|\x{00f3}|\x{00f4}|\x{00f5}|\x{00f6}|
					\x{00f8}|\x{00f9}|\x{00fa}|\x{00fb}|\x{00fc}|\x{00fd}|\x{00fe}|\x{00ff}|\x{0100}|\x{0101}|\x{0102}|
					\x{0103}|\x{0104}|\x{0105}|\x{0106}|\x{0107}|\x{0108}|\x{0109}|\x{010a}|\x{010b}|\x{010c}|\x{010d}|
					\x{010e}|\x{010f}|\x{0110}|\x{0111}|\x{0112}|\x{0113}|\x{0114}|\x{0115}|\x{0116}|\x{0117}|\x{0118}|
					\x{0119}|\x{011a}|\x{011b}|\x{011c}|\x{011d}|\x{011e}|\x{011f}|\x{0120}|\x{0121}|\x{0122}|\x{0123}|
					\x{0124}|\x{0125}|\x{0126}|\x{0127}|\x{0128}|\x{0129}|\x{012a}|\x{012b}|\x{012c}|\x{012d}|\x{012e}|
					\x{012f}|\x{0130}|\x{0131}|\x{0132}|\x{0133}|\x{0134}|\x{0135}|\x{0136}|\x{0137}|\x{0138}|\x{0139}|
					\x{013a}|\x{013b}|\x{013c}|\x{013d}|\x{013e}|\x{013f}|\x{0140}|\x{0141}|\x{0142}|\x{0143}|\x{0144}|
					\x{0145}|\x{0146}|\x{0147}|\x{0148}|\x{0149}|\x{014a}|\x{014b}|\x{014c}|\x{014d}|\x{014e}|\x{014f}|
					\x{0150}|\x{0151}|\x{0152}|\x{0153}|\x{0154}|\x{0155}|\x{0156}|\x{0157}|\x{0158}|\x{0159}|\x{015a}|
					\x{015b}|\x{015c}|\x{015d}|\x{015e}|\x{015f}|\x{0160}|\x{0161}|\x{0162}|\x{0163}|\x{0164}|\x{0165}|
					\x{0166}|\x{0167}|\x{0168}|\x{0169}|\x{016a}|\x{016b}|\x{016c}|\x{016d}|\x{016e}|\x{016f}|\x{0170}|
					\x{0171}|\x{0172}|\x{0173}|\x{0174}|\x{0175}|\x{0176}|\x{0177}|\x{0178}|\x{0179}|\x{017a}|\x{017b}|
					\x{017c}|\x{017d}|\x{017e}|\x{017f}
				)
			)
			'; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8)
		$word = "
			(?:
				(?<![\w\&])	# negative lookbehind to ensure
							#	1) we are proceeded by a non-word-character, and
							#	2) we are not inside an HTML character def
				(?:
					[\w\-\_\/]
					|
					$htmlLetters
					|
					$htmlLetterConnectors
				)+
			)
			"; // required modifiers: x (multiline pattern) u (utf8)

		# find any text
		$anyText = "$space|$punctuation|$word"; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8)

		$parts = preg_split("/($anyText)/ixu", $rawText, -1, PREG_SPLIT_DELIM_CAPTURE);
		if($parts === FALSE) {
			// PCRE failure (e.g. invalid UTF-8 or an exhausted backtrack limit)
			return FALSE;
		}

		$index = 0;
		foreach ($parts as $part) {
			if ($part == "") continue;

			if(preg_match("/\A$space\Z/xiu", $part)) {
				$type = 'space';
			} elseif(preg_match("/\A$punctuation\Z/sxiu", $part)) {
				$type = 'punctuation';
			} elseif(preg_match("/\A$word\Z/xu", $part)) {
				$type = 'word';
			} else {
				$type = 'other';
			}
			$value = $part;

			if($type == 'word' || $type == 'other') {
				//make sure that things like email addresses and URLs are not broken up into words and punctuation
				// tokens 0 .. $index-1 are always populated at this point
				$prevType = ($index >= 1) ? $tokens[$index-1]['type'] : '';
				$olderType = ($index >= 2) ? $tokens[$index-2]['type'] : '';

				// a word glues onto a preceding "other"; an "other" glues onto a preceding "other" or "word"
				$joinsPrevious = ($prevType == 'other') || ($type == 'other' && $prevType == 'word');

				if($joinsPrevious) {
					$index = $index-1;
					$value = $tokens[$index]['value'].$part;
					$type = 'other';
				} elseif($index >= 2 && $prevType == 'punctuation' && $olderType != 'space') {
					// non-space + punctuation + this part collapse into a single "other" token
					$value = $tokens[$index-2]['value'].$tokens[$index-1]['value'].$part;
					unset($tokens[$index-1]);
					$index = $index-2;
					$type = 'other';
				}
			}

			$tokens[$index] = array(
				"type"	=> $type,
				"value"	=> $value,
			);

			if(isset($this->parsedHTML["parents"]))
				$tokens[$index]["parents"] = $this->parsedHTML["parents"];

			$index++;
		}

		$this->text = $tokens;
		return TRUE;
	}

	# Action:  reloads $this->text (i.e. capture new inserted text, or remove those whose values are deleted)
	# Returns: TRUE on completion
	# WARNING: Tokens previously acquired through "get" methods may not match new tokenization
	function reload() {
		return $this->load($this->unload());
	}

	# Action:  outputs Text and clears the loaded state
	# Returns: STRING of Text (if a string was initially loaded), or the ARRAY parseHTML token
	#          originally loaded with its "value" replaced by the reassembled text
	function unload() {
		$reassembledText = "";
		foreach($this->text as $token) {
			$reassembledText .= $token["value"];
		}

		if(is_array($this->parsedHTML)) {
			// the initial value loaded was a single token of class parseHTML, so we will return in the same format
			$this->parsedHTML["value"] = $reassembledText;
			$output = $this->parsedHTML;
		} else {
			// the initial value loaded was a string, so we will return in the same format
			$output = $reassembledText;
		}

		$this->clear();
		return $output;
	}

	# Action:  resets $this->text and $this->parsedHTML
	# Returns: TRUE on completion
	function clear() {
		$this->text = array();
		$this->parsedHTML = "";
		return TRUE;
	}

	# Parameter: ARRAY of tokens, keyed by the same indexes used in $this->text
	# Action:  overwrite "value" for all matching tokens
	# Returns: TRUE on completion
	function update($tokens) {
		foreach($tokens as $index => $token) {
			$this->text[$index]["value"] = $token["value"];
		}
		return TRUE;
	}

	########################################################################
	# GET METHODS
	#
	# Returns: ARRAY of sought tokens
	function get_all() {
		return $this->text;
	}
	function get_spaces() {
		return $this->get_type("space");
	}
	function get_punctuation() {
		return $this->get_type("punctuation");
	}

	# Parameter: $abc  letter-only match OPTIONAL INT -1=>prohibit, 0=>allow, 1=>require
	#            $caps capital-only match (allows non letter chrs) OPTIONAL INT -1=>prohibit, 0=>allow, 1=>require
	# Returns: ARRAY of "word" tokens satisfying both filters, keyed by their original index;
	#          any other filter value matches nothing
	function get_words($abc = 0, $caps = 0) {
		$tokens = array();

		// everything that is not a letter: connectors, digits and entity punctuation
		$nonLetters = "/".$this->letter_connector_pattern()."|[0-9\-_&#;\/]/ux";

		foreach($this->get_type("word") as $index => $token) {
			$value = $token["value"];
			$capped = ($this->mb) ? mb_strtoupper($value, "UTF-8") : strtoupper($value);
			$lettered = preg_replace($nonLetters, "", $value);

			// strict comparison: loose == would treat numeric-looking words such as
			// "1e3" and "1E3" as equal and misreport them as all-caps
			$isLettersOnly = ($lettered === $value);
			$isAllCaps = ($capped === $value);

			$abcOk = ($abc == 0) || ($abc == -1 && !$isLettersOnly) || ($abc == 1 && $isLettersOnly);
			$capsOk = ($caps == 0) || ($caps == -1 && !$isAllCaps) || ($caps == 1 && $isAllCaps);

			if($abcOk && $capsOk) $tokens[$index] = $token;
		}
		return $tokens;
	}

	function get_other() {
		return $this->get_type("other");
	}

	#=======================================================================
	#=======================================================================
	#==	MISC. METHODS
	#=======================================================================
	#=======================================================================

	# Params:  STRING type to get
	# Returns: ARRAY of tokens of that type, keyed by their original index
	function get_type($type) {
		$tokens = array();
		foreach($this->text as $index => $token) {
			if($token["type"] == $type)
				$tokens[$index] = $token;
		}
		return $tokens;
	}

	# Returns: STRING regex fragment matching the letter connectors allowed inside words
	#          (single source for both load and get_words):
	#		hyphens ("&#45;", "&#173;", "&#8208;", "&#8209;", "&#8210;", "&#x002d;", "&#x00ad;", "&#x2010;", "&#x2011;", "&#x2012;", "&shy;")
	#		underscore ("&#95;", "&#x005f;")
	#		zero-width-space ("&#8203;", "&#x200b;")
	#		zero-width-non-joiner ("&#8204;", "&#x200c;", "&zwnj;")
	#		zero-width-joiner ("&#8205;", "&#x200d;", "&zwj;")
	function letter_connector_pattern() {
		return '
			(?:
				(?:										# alpha matches
					&
					(?: shy|zwj|zwnj )
					;
				)
				|
				(?:										# decimal matches
					&\#
					(?: 45|95|173|820[3-589]|8210 )
					;
				)
				|
				(?:										# hexidecimal matches
					&\#x
					(?: 002d|005f|00ad|200[b-d]|201[0-2] )
					;
				)
				|
				(?:										# actual characters
					\x{002d}|\x{005f}|\x{00ad}|\x{200b}|\x{200c}|\x{200d}|\x{2010}|\x{2011}|\x{2012}
				)
			)
			'; // required modifiers: x (multiline pattern) i (case insensitive) u (utf8)
	}
} // end class parseText

View file

@ -0,0 +1,25 @@
<?php
/*
Project Name: PHP Parser
URI: http://kingdesk.com/projects/php-parser/
Author: Jeffrey D. King
Author URI: http://kingdesk.com/about/jeff/
Version: 1.19
Copyright 2009, KINGdesk, LLC. Licensed under the GNU General Public License 2.0. If you use, modify and/or redistribute this software, you must leave the KINGdesk, LLC copyright information, the request for a link to http://kingdesk.com, and the web design services contact information unchanged. If you redistribute this software, or any derivative, it must be released under the GNU General Public License 2.0. This program is distributed without warranty (implied or otherwise) of suitability for any particular purpose. See the GNU General Public License for full license terms <http://creativecommons.org/licenses/GPL/2.0/>.
WE DON'T WANT YOUR MONEY: NO TIPS NECESSARY! If you enjoy this plugin, a link to http://kingdesk.com from your website would be appreciated.
For web design services, please contact info@kingdesk.com.
*/
# two classes defined:
# - parseHTML
# - parseText
#
# PHP Parser has been tested in PHP5. It may work in PHP4, but it has not been tested in that environment.
# If you have problems or success in PHP4, please let us know at info@kingdesk.com
require_once('parseHTML.php');
require_once('parseText.php');