diff options
author | tailor <atrus@atrus.org> | 2007-02-16 10:18:49 +0000 |
---|---|---|
committer | tailor <atrus@atrus.org> | 2007-02-16 10:18:49 +0000 |
commit | 017706efb0b42da71656b59ff322471b4b85f828 (patch) | |
tree | fabb0bed7994e9c4654e8e6e16664bae738c5949 | |
parent | 7448aa955cb1015afc43d8a6563ba61a511db592 (diff) | |
download | php-openid-017706efb0b42da71656b59ff322471b4b85f828.zip php-openid-017706efb0b42da71656b59ff322471b4b85f828.tar.gz php-openid-017706efb0b42da71656b59ff322471b4b85f828.tar.bz2 |
[project @ Improve HTML parser efficiency and tolerance]
-rw-r--r-- | Services/Yadis/ParseHTML.php | 142 |
1 files changed, 58 insertions, 84 deletions
diff --git a/Services/Yadis/ParseHTML.php b/Services/Yadis/ParseHTML.php index fc0df17..2906645 100644 --- a/Services/Yadis/ParseHTML.php +++ b/Services/Yadis/ParseHTML.php @@ -30,33 +30,15 @@ class Services_Yadis_ParseHTML { /** * @access private */ - var $_tag_expr = "<%s\b(?!:)([^>]*?)(?:\/>|>(.*?)(?:<\/?%s\s*>|\Z))"; + var $_tag_expr = "<%s%s(?:\s.*?)?%s>"; /** * @access private */ - var $_close_tag_expr = "<\/?%s\s*>"; - - /** - * @access private - */ - var $_removed_re = - "<!--.*?-->|<!\[CDATA\[.*?\]\]>|<script\b(?!:)[^>]*>.*?<\/script>"; - - /** - * @access private - */ - var $_attr_find = '\b([-\w]+)=("[^"]*"|\'[^\']*\'|[^\'"\s\/<>]+)'; + var $_attr_find = '\b([-\w]+)=(".*?"|\'.*?\'|.+?)[\s>]'; function Services_Yadis_ParseHTML() { - $this->_meta_find = sprintf("/<meta\b(?!:)([^>]*)(?!<)>/%s", - $this->_re_flags); - - $this->_removed_re = sprintf("/%s/%s", - $this->_removed_re, - $this->_re_flags); - $this->_attr_find = sprintf("/%s/%s", $this->_attr_find, $this->_re_flags); @@ -121,46 +103,37 @@ class Services_Yadis_ParseHTML { } /** - * Create a regular expression that will match an opening (and - * optional) closing tag of a given name. + * Create a regular expression that will match an opening + * or closing tag from a set of names. * * @access private - * @param string $tag_name The tag name to match - * @param array $close_tags An array of tag names which also - * constitute closing of the original tag + * @param mixed $tag_names Tag names to match + * @param mixed $close false/0 = no, true/1 = yes, other = maybe + * @param mixed $self_close false/0 = no, true/1 = yes, other = maybe * @return string $regex A regular expression string to be used * in, say, preg_match. 
*/ - function tagMatcher($tag_name, $close_tags = null) + function tagPattern($tag_names, $close, $self_close) { - if ($close_tags) { - $options = implode("|", array_merge(array($tag_name), $close_tags)); - $closer = sprintf("(?:%s)", $options); + if (is_array($tag_names)) { + $tag_names = '(?:'.implode('|',$tag_names).')'; + } + if ($close) { + $close = '\/' . (($close == 1)? '' : '?'); } else { - $closer = $tag_name; + $close = ''; } + if ($self_close) { + $self_close = '(?:\/\s*)' . (($self_close == 1)? '' : '?'); + } else { + $self_close = ''; + } + $expr = sprintf($this->_tag_expr, $close, $tag_names, $self_close); - $expr = sprintf($this->_tag_expr, $tag_name, $closer); return sprintf("/%s/%s", $expr, $this->_re_flags); } /** - * @access private - */ - function htmlFind($str) - { - return $this->tagMatcher('html', array('body')); - } - - /** - * @access private - */ - function headFind() - { - return $this->tagMatcher('head', array('body')); - } - - /** * Given an HTML document string, this finds all the META tags in * the document, provided they are found in the * <HTML><HEAD>...</HEAD> section of the document. The <HTML> tag @@ -173,51 +146,52 @@ class Services_Yadis_ParseHTML { */ function getMetaTags($html_string) { - $stripped = preg_replace($this->_removed_re, - "", - $html_string); - - // Look for the closing body tag. 
- $body_closer = sprintf($this->_close_tag_expr, 'body'); - $body_matches = array(); - preg_match($body_closer, $html_string, $body_matches, - PREG_OFFSET_CAPTURE); - if ($body_matches) { - $html_string = substr($html_string, 0, $body_matches[0][1]); + $key_tags = array($this->tagPattern('html', false, false), + $this->tagPattern('head', false, false), + $this->tagPattern('head', true, false), + $this->tagPattern('html', true, false), + $this->tagPattern(array( + 'body', 'frameset', 'frame', 'p', 'div', + 'table','span','a'), 'maybe', 'maybe')); + $key_tags_pos = array(); + foreach ($key_tags as $pat) { + $matches = array(); + preg_match($pat, $html_string, $matches, PREG_OFFSET_CAPTURE); + if($matches) { + $key_tags_pos[] = $matches[0][1]; + } else { + $key_tags_pos[] = null; + } } - - // Look for the opening body tag, and discard everything after - // that tag. - $body_re = $this->tagMatcher('body'); - $body_matches = array(); - preg_match($body_re, $html_string, $body_matches, PREG_OFFSET_CAPTURE); - if ($body_matches) { - $html_string = substr($html_string, 0, $body_matches[0][1]); + // no opening head tag + if (is_null($key_tags_pos[1])) { + return array(); } - - // If an HTML tag is found at all, it must be in the right - // order; else, it may be missing (which is a case we allow - // for). - $html_re = $this->tagMatcher('html', array('body')); - preg_match($html_re, $html_string, $html_matches); - if ($html_matches) { - $html = $html_matches[0]; - } else { - $html = $html_string; + // the effective </head> is the min of the following + if (is_null($key_tags_pos[2])) { + $key_tags_pos[2] = strlen($html_string); } - - // Try to find the <HEAD> tag. 
- $head_re = $this->headFind(); - $head_matches = array(); - if (!preg_match($head_re, $html, $head_matches)) { + foreach (array($key_tags_pos[3], $key_tags_pos[4]) as $pos) { + if (!is_null($pos) && $pos < $key_tags_pos[2]) { + $key_tags_pos[2] = $pos; + } + } + // closing head tag comes before opening head tag + if ($key_tags_pos[1] > $key_tags_pos[2]) { + return array(); + } + // if there is an opening html tag, make sure the opening head tag + // comes after it + if (!is_null($key_tags_pos[0]) && $key_tags_pos[1] < $key_tags_pos[0]) { return array(); } + $html_string = substr($html_string, $key_tags_pos[1], ($key_tags_pos[2]-$key_tags_pos[1])); $link_data = array(); $link_matches = array(); - - if (!preg_match_all($this->_meta_find, $head_matches[0], - $link_matches)) { + + if (!preg_match_all($this->tagPattern('meta', false, 'maybe'), + $html_string, $link_matches)) { return array(); } |