//-----------------------------------------------------------------------
//
//     Copyright (c) Outercurve Foundation, Scott Hanselman, Jason Alexander. All rights reserved.
//
//-----------------------------------------------------------------------

namespace DotNetOpenAuth.Yadis {
    using System;
    using System.Collections.Generic;
    using System.Globalization;
    using System.Linq;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Web;
    using System.Web.UI.HtmlControls;
    using Validation;

    /// <summary>
    /// An HTML HEAD tag parser.
    /// </summary>
    /// <remarks>
    /// All patterns in this class are compiled with <see cref="Flags"/>, which includes
    /// <see cref="RegexOptions.IgnorePatternWhitespace"/>: whitespace and <c>#</c> comments
    /// inside the pattern strings are documentation only and do not take part in matching.
    /// </remarks>
    internal static class HtmlParser {
        /// <summary>
        /// Common flags to use on regex tests.
        /// </summary>
        private const RegexOptions Flags = RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline | RegexOptions.Compiled | RegexOptions.IgnoreCase;

        /// <summary>
        /// A regular expression designed to select tags (?)
        /// </summary>
        /// <remarks>
        /// This is a composite format string: <c>{0}</c> is the tag name and <c>{1}</c> is the
        /// expression for the tag names that may close it (see <see cref="TagMatcher"/>).
        /// NOTE(review): the group names <c>attrs</c> and <c>contents</c> are not read by any
        /// code in this file; confirm them against the upstream pattern.
        /// </remarks>
        private const string TagExpr =
            "\n# Starts with the tag name at a word boundary, where the tag name is" +
            "\n# not a namespace" +
            "\n<{0}\\b(?!:)" +
            "\n" +
            "\n# All of the stuff up to a \">\", hopefully attributes." +
            "\n(?<attrs>[^>]*?)" +
            "\n" +
            "\n(?: # Match a short tag" +
            "\n    />" +
            "\n" +
            "\n|   # Match a full tag" +
            "\n    >" +
            "\n" +
            "\n    (?<contents>.*?)" +
            "\n" +
            "\n    # Closed by" +
            "\n    (?: # One of the specified close tags" +
            "\n        </?{1}\\s*>" +
            "\n" +
            "\n    # End of the string" +
            "\n    |   \\Z" +
            "\n" +
            "\n    )" +
            "\n" +
            "\n)" +
            "\n";

        /// <summary>
        /// A regular expression designed to select start tags (?)
        /// </summary>
        /// <remarks>
        /// This is a composite format string: <c>{0}</c> is the tag name
        /// (see <see cref="StartTagMatcher"/>).
        /// </remarks>
        private const string StartTagExpr =
            "\n# Starts with the tag name at a word boundary, where the tag name is" +
            "\n# not a namespace" +
            "\n<{0}\\b(?!:)" +
            "\n" +
            "\n# All of the stuff up to a \">\", hopefully attributes." +
            "\n(?<attrs>[^>]*?)" +
            "\n" +
            "\n(?: # Match a short tag" +
            "\n    />" +
            "\n" +
            "\n|   # Match a full tag" +
            "\n    >" +
            "\n    )" +
            "\n";

        /// <summary>
        /// A regular expression designed to select attributes within a tag.
        /// </summary>
        /// <remarks>
        /// Each match is either one attribute (groups <c>attrname</c> and <c>attrval</c>) or a
        /// stray angle bracket (group <c>endtag</c>), which <see cref="HeadTags"/> treats as the
        /// end of the attribute list. The <c>qopen</c> group only ties the closing quote to the
        /// opening one; NOTE(review): its name is not read by code in this file.
        /// </remarks>
        private static readonly Regex attrRe = new Regex(
            "\n# Must start with a sequence of word-characters, followed by an equals sign" +
            "\n(?<attrname>(\\w|-)+)=" +
            "\n" +
            "\n# Then either a quoted or unquoted attribute" +
            "\n(?:" +
            "\n" +
            "\n    # Match everything that's between matching quote marks" +
            "\n    (?<qopen>[\"\\'])(?<attrval>.*?)\\k<qopen>" +
            "\n|" +
            "\n" +
            "\n    # If the value is not quoted, match up to whitespace" +
            "\n    (?<attrval>(?:[^\\s<>/]|/(?!>))+)" +
            "\n)" +
            "\n" +
            "\n|" +
            "\n" +
            "\n(?<endtag>[<>])" +
            "\n",
            Flags);

        /// <summary>
        /// A regular expression designed to select the HEAD tag.
        /// </summary>
        private static readonly Regex headRe = TagMatcher("head", new[] { "body" });

        /// <summary>
        /// A regular expression designed to select the HTML tag.
        /// </summary>
        private static readonly Regex htmlRe = TagMatcher("html", new string[0]);

        /// <summary>
        /// A regular expression designed to remove all comments and scripts from a string.
        /// </summary>
        private static readonly Regex removedRe = new Regex(@"<!--.*?-->|<!\[CDATA\[.*?\]\]>|<script\b[^>]*>.*?</script>", Flags);

        /// <summary>
        /// Finds all the HTML HEAD tag child elements that match the tag name of a given type.
        /// </summary>
        /// <typeparam name="T">The HTML tag of interest.</typeparam>
        /// <param name="html">The HTML to scan.</param>
        /// <returns>A sequence of the matching elements.</returns>
        /// <remarks>
        /// This is an iterator, so no scanning happens until the result is enumerated.
        /// Nothing is yielded when no HTML element, or no HEAD element inside it, is found.
        /// </remarks>
        public static IEnumerable<T> HeadTags<T>(string html) where T : HtmlControl, new() {
            // Strip comments, CDATA sections and scripts so that markup inside them is not mistaken for tags.
            html = removedRe.Replace(html, string.Empty);
            Match htmlMatch = htmlRe.Match(html);
            string tagName = (new T()).TagName;
            if (!htmlMatch.Success) {
                yield break;
            }

            // Only look for HEAD within the span matched for the HTML element.
            Match headMatch = headRe.Match(html, htmlMatch.Index, htmlMatch.Length);
            if (!headMatch.Success) {
                yield break;
            }

            Regex startTagRe = StartTagMatcher(tagName);
            for (Match tagMatch = startTagRe.Match(html, headMatch.Index, headMatch.Length); tagMatch.Success; tagMatch = tagMatch.NextMatch()) {
                // Skip past "<" + tagName so attribute scanning starts right after the tag name
                // and is confined to the matched start tag.
                int beginning = tagMatch.Index + tagName.Length + 1;
                int length = (tagMatch.Index + tagMatch.Length) - beginning;
                Match attrMatch = attrRe.Match(html, beginning, length);
                var headTag = new T();
                while (attrMatch.Success) {
                    // An angle bracket marks the end of this tag's attribute list.
                    if (attrMatch.Groups["endtag"].Success) {
                        break;
                    }

                    string attributeName = attrMatch.Groups["attrname"].Value;
                    string attributeValue = HttpUtility.HtmlDecode(attrMatch.Groups["attrval"].Value);
                    headTag.Attributes.Add(attributeName, attributeValue);
                    attrMatch = attrMatch.NextMatch();
                }

                yield return headTag;
            }
        }

        /// <summary>
        /// Filters a list of controls based on presence of an attribute.
        /// </summary>
        /// <typeparam name="T">The type of HTML controls being filtered.</typeparam>
        /// <param name="sequence">The sequence.</param>
        /// <param name="attribute">The attribute.</param>
        /// <returns>The controls from <paramref name="sequence"/> whose named attribute is not null.</returns>
        internal static IEnumerable<T> WithAttribute<T>(this IEnumerable<T> sequence, string attribute) where T : HtmlControl {
            Requires.NotNull(sequence, "sequence");
            Requires.NotNullOrEmpty(attribute, "attribute");
            return sequence.Where(tag => tag.Attributes[attribute] != null);
        }

        /// <summary>
        /// Generates a regular expression that will find a given HTML tag.
        /// </summary>
        /// <param name="tagName">Name of the tag.</param>
        /// <param name="closeTags">The close tags (?).</param>
        /// <returns>The created regular expression.</returns>
        private static Regex TagMatcher(string tagName, params string[] closeTags) {
            // The tag may be closed by its own name or by any of the extra close tags;
            // with no extras, the bare tag name is enough and needs no grouping.
            string closers = closeTags.Length > 0
                ? "(?:" + tagName + "|" + string.Join("|", closeTags) + ")"
                : tagName;

            return new Regex(string.Format(CultureInfo.InvariantCulture, TagExpr, tagName, closers), Flags);
        }

        /// <summary>
        /// Generates a regular expression designed to find a given tag.
        /// </summary>
        /// <param name="tagName">The tag to find.</param>
        /// <returns>The created regular expression.</returns>
        private static Regex StartTagMatcher(string tagName) {
            return new Regex(string.Format(CultureInfo.InvariantCulture, StartTagExpr, tagName), Flags);
        }
    }
}