summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: tailor <cygnus@janrain.com> 2006-01-14 00:11:03 +0000
committer: tailor <cygnus@janrain.com> 2006-01-14 00:11:03 +0000
commit: 8ee5a5830f2cd3d9f9a15df133d8aebbbc74a562 (patch)
tree: 54373cfc3dcd6bb5266f43f779a86a8353515b64
parent: c3eb2e8863d40dfcc227145879c8b97ccfaaf11a (diff)
download: php-openid-8ee5a5830f2cd3d9f9a15df133d8aebbbc74a562.zip
php-openid-8ee5a5830f2cd3d9f9a15df133d8aebbbc74a562.tar.gz
php-openid-8ee5a5830f2cd3d9f9a15df133d8aebbbc74a562.tar.bz2
[project @ Updated parser docs]
-rw-r--r-- Net/OpenID/Consumer/Parse.php 73
1 files changed, 72 insertions, 1 deletions
diff --git a/Net/OpenID/Consumer/Parse.php b/Net/OpenID/Consumer/Parse.php
index 336a65d..5d63234 100644
--- a/Net/OpenID/Consumer/Parse.php
+++ b/Net/OpenID/Consumer/Parse.php
@@ -3,7 +3,70 @@
/**
* This module implements a VERY limited parser that finds <link> tags
* in the head of HTML or XHTML documents and parses out their
- * attributes according to the OpenID spec.
+ * attributes according to the OpenID spec. It is a liberal parser,
+ * but it requires these things from the data in order to work:
+ *
+ * - There must be an open <html> tag
+ *
+ * - There must be an open <head> tag inside of the <html> tag
+ *
+ * - Only <link>s that are found inside of the <head> tag are parsed
+ * (this is by design)
+ *
+ * - The parser follows the OpenID specification in resolving the
+ * attributes of the link tags. This means that the attributes DO
+ * NOT get resolved as they would by an XML or HTML parser. In
+ * particular, only certain entities get replaced, and href
+ * attributes do not get resolved relative to a base URL.
+ *
+ * From http://openid.net/specs.bml:
+ *
+ * - The openid.server URL MUST be an absolute URL. OpenID consumers
+ * MUST NOT attempt to resolve relative URLs.
+ *
+ * - The openid.server URL MUST NOT include entities other than &amp;,
+ * &lt;, &gt;, and &quot;.
+ *
+ * The parser ignores SGML comments and <![CDATA[blocks]]>. Both kinds
+ * of quoting are allowed for attributes.
+ *
+ * The parser deals with invalid markup in these ways:
+ *
+ * - Tag names are not case-sensitive
+ *
+ * - The <html> tag is accepted even when it is not at the top level
+ *
+ * - The <head> tag is accepted even when it is not a direct child of
+ * the <html> tag, but a <html> tag must be an ancestor of the
+ * <head> tag
+ *
+ * - <link> tags are accepted even when they are not direct children
+ * of the <head> tag, but a <head> tag must be an ancestor of the
+ * <link> tag
+ *
+ * - If there is no closing tag for an open <html> or <head> tag, the
+ * remainder of the document is viewed as being inside of the
+ * tag. If there is no closing tag for a <link> tag, the link tag is
+ * treated as a short tag. Exceptions to this rule are that an <html>
+ * tag closes an open <html> tag, and a <body> or <head> tag closes an
+ * open <head> tag.
+ *
+ * - Attributes of the <link> tag are not required to be quoted.
+ *
+ * - In the case of duplicated attribute names, the attribute coming
+ * last in the tag will be the value returned.
+ *
+ * - Any text that does not parse as an attribute within a link tag
+ * will be ignored. (e.g. <link pumpkin rel='openid.server' /> will
+ * ignore pumpkin)
+ *
+ * - If there is more than one <html> or <head> tag, the parser only
+ * looks inside of the first one.
+ *
+ * - The contents of <script> tags are ignored entirely, except for
+ * unclosed <script> tags; an unclosed <script> tag is itself ignored.
+ *
+ * - Any other invalid markup is ignored, including unclosed SGML
+ * comments and unclosed <![CDATA[blocks.
*
* PHP versions 4 and 5
*
@@ -112,6 +175,14 @@ function Net_OpenID_remove_quotes($str)
}
}
+/**
+ * Find all link tags in a string representing an HTML document and
+ * return a list of their attributes.
+ *
+ * @param string $html The text to parse
+ * @return array $list An array of arrays of attributes, one for each
+ * link tag
+ */
function Net_OpenID_parseLinkAttrs($html)
{