summaryrefslogtreecommitdiffstats
path: root/src/DotNetOpenAuth.OpenId/Yadis/HtmlParser.cs
diff options
context:
space:
mode:
Diffstat (limited to 'src/DotNetOpenAuth.OpenId/Yadis/HtmlParser.cs')
-rw-r--r--src/DotNetOpenAuth.OpenId/Yadis/HtmlParser.cs142
1 files changed, 142 insertions, 0 deletions
diff --git a/src/DotNetOpenAuth.OpenId/Yadis/HtmlParser.cs b/src/DotNetOpenAuth.OpenId/Yadis/HtmlParser.cs
new file mode 100644
index 0000000..a6b64c1
--- /dev/null
+++ b/src/DotNetOpenAuth.OpenId/Yadis/HtmlParser.cs
@@ -0,0 +1,142 @@
+//-----------------------------------------------------------------------
+// <copyright file="HtmlParser.cs" company="Andrew Arnott, Scott Hanselman, Jason Alexander">
+// Copyright (c) Andrew Arnott, Scott Hanselman, Jason Alexander. All rights reserved.
+// </copyright>
+//-----------------------------------------------------------------------
+
+namespace DotNetOpenAuth.Yadis {
+ using System;
+ using System.Collections.Generic;
+ using System.Diagnostics.Contracts;
+ using System.Globalization;
+ using System.Linq;
+ using System.Text;
+ using System.Text.RegularExpressions;
+ using System.Web;
+ using System.Web.UI.HtmlControls;
+
+ /// <summary>
+ /// An HTML HEAD tag parser.
+ /// </summary>
+ internal static class HtmlParser {
+ /// <summary>
+ /// Common flags to use on regex tests.
+ /// </summary>
+ private const RegexOptions Flags = RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline | RegexOptions.Compiled | RegexOptions.IgnoreCase;
+
+ /// <summary>
+ /// A regular expression designed to select tags (?)
+ /// </summary>
+ private const string TagExpr = "\n# Starts with the tag name at a word boundary, where the tag name is\n# not a namespace\n<{0}\\b(?!:)\n \n# All of the stuff up to a \">\", hopefully attributes.\n(?<attrs>[^>]*?)\n \n(?: # Match a short tag\n />\n \n| # Match a full tag\n >\n \n (?<contents>.*?)\n \n # Closed by\n (?: # One of the specified close tags\n </?{1}\\s*>\n \n # End of the string\n | \\Z\n \n )\n \n)\n ";
+
+ /// <summary>
+ /// A regular expression designed to select start tags (?)
+ /// </summary>
+ private const string StartTagExpr = "\n# Starts with the tag name at a word boundary, where the tag name is\n# not a namespace\n<{0}\\b(?!:)\n \n# All of the stuff up to a \">\", hopefully attributes.\n(?<attrs>[^>]*?)\n \n(?: # Match a short tag\n />\n \n| # Match a full tag\n >\n )\n ";
+
+ /// <summary>
+ /// A regular expression designed to select attributes within a tag.
+ /// </summary>
+ private static readonly Regex attrRe = new Regex("\n# Must start with a sequence of word-characters, followed by an equals sign\n(?<attrname>(\\w|-)+)=\n\n# Then either a quoted or unquoted attribute\n(?:\n\n # Match everything that's between matching quote marks\n (?<qopen>[\"\\'])(?<attrval>.*?)\\k<qopen>\n|\n\n # If the value is not quoted, match up to whitespace\n (?<attrval>(?:[^\\s<>/]|/(?!>))+)\n)\n\n|\n\n(?<endtag>[<>])\n ", Flags);
+
+ /// <summary>
+ /// A regular expression designed to select the HEAD tag.
+ /// </summary>
+ private static readonly Regex headRe = TagMatcher("head", new[] { "body" });
+
+ /// <summary>
+ /// A regular expression designed to select the HTML tag.
+ /// </summary>
+ private static readonly Regex htmlRe = TagMatcher("html", new string[0]);
+
+ /// <summary>
+ /// A regular expression designed to remove all comments and scripts from a string.
+ /// </summary>
+ private static readonly Regex removedRe = new Regex(@"<!--.*?-->|<!\[CDATA\[.*?\]\]>|<script\b[^>]*>.*?</script>", Flags);
+
+ /// <summary>
+ /// Finds all the HTML HEAD tag child elements that match the tag name of a given type.
+ /// </summary>
+ /// <typeparam name="T">The HTML tag of interest.</typeparam>
+ /// <param name="html">The HTML to scan.</param>
+ /// <returns>A sequence of the matching elements.</returns>
+ public static IEnumerable<T> HeadTags<T>(string html) where T : HtmlControl, new() {
+ html = removedRe.Replace(html, string.Empty);
+ Match match = htmlRe.Match(html);
+ string tagName = (new T()).TagName;
+ if (match.Success) {
+ Match match2 = headRe.Match(html, match.Index, match.Length);
+ if (match2.Success) {
+ string text = null;
+ string text2 = null;
+ Regex regex = StartTagMatcher(tagName);
+ for (Match match3 = regex.Match(html, match2.Index, match2.Length); match3.Success; match3 = match3.NextMatch()) {
+ int beginning = (match3.Index + tagName.Length) + 1;
+ int length = (match3.Index + match3.Length) - beginning;
+ Match match4 = attrRe.Match(html, beginning, length);
+ var headTag = new T();
+ while (match4.Success) {
+ if (match4.Groups["endtag"].Success) {
+ break;
+ }
+ text = match4.Groups["attrname"].Value;
+ text2 = HttpUtility.HtmlDecode(match4.Groups["attrval"].Value);
+ headTag.Attributes.Add(text, text2);
+ match4 = match4.NextMatch();
+ }
+ yield return headTag;
+ }
+ }
+ }
+ }
+
+ /// <summary>
+ /// Filters a list of controls based on presence of an attribute.
+ /// </summary>
+ /// <typeparam name="T">The type of HTML controls being filtered.</typeparam>
+ /// <param name="sequence">The sequence.</param>
+ /// <param name="attribute">The attribute.</param>
+ /// <returns>A filtered sequence of attributes.</returns>
+ internal static IEnumerable<T> WithAttribute<T>(this IEnumerable<T> sequence, string attribute) where T : HtmlControl {
+ Contract.Requires<ArgumentNullException>(sequence != null);
+ Contract.Requires<ArgumentException>(!String.IsNullOrEmpty(attribute));
+ return sequence.Where(tag => tag.Attributes[attribute] != null);
+ }
+
+ /// <summary>
+ /// Generates a regular expression that will find a given HTML tag.
+ /// </summary>
+ /// <param name="tagName">Name of the tag.</param>
+ /// <param name="closeTags">The close tags (?).</param>
+ /// <returns>The created regular expression.</returns>
+ private static Regex TagMatcher(string tagName, params string[] closeTags) {
+ string text2;
+ if (closeTags.Length > 0) {
+ StringBuilder builder = new StringBuilder();
+ builder.AppendFormat("(?:{0}", tagName);
+ int index = 0;
+ string[] textArray = closeTags;
+ int length = textArray.Length;
+ while (index < length) {
+ string text = textArray[index];
+ index++;
+ builder.AppendFormat("|{0}", text);
+ }
+ builder.Append(")");
+ text2 = builder.ToString();
+ } else {
+ text2 = tagName;
+ }
+ return new Regex(string.Format(CultureInfo.InvariantCulture, TagExpr, tagName, text2), Flags);
+ }
+
+ /// <summary>
+ /// Generates a regular expression designed to find a given tag.
+ /// </summary>
+ /// <param name="tagName">The tag to find.</param>
+ /// <returns>The created regular expression.</returns>
+ private static Regex StartTagMatcher(string tagName) {
+ return new Regex(string.Format(CultureInfo.InvariantCulture, StartTagExpr, tagName), Flags);
+ }
+ }
+}