diff options
author | Andrew Arnott <andrewarnott@gmail.com> | 2009-09-20 21:18:59 -0700 |
---|---|---|
committer | Andrew Arnott <andrewarnott@gmail.com> | 2009-09-21 08:06:22 -0700 |
commit | bbe3f9cc9c8a1e5909273c1a162a63ea7a66afd8 (patch) | |
tree | c91f66e642c4d26fca266e226b3f2765f546d700 /tools/Sandcastle/Source/DBCSFix/Program.cs | |
parent | 627014f0bbc3fd576277375e70f8391d150b0a67 (diff) | |
download | DotNetOpenAuth-bbe3f9cc9c8a1e5909273c1a162a63ea7a66afd8.zip DotNetOpenAuth-bbe3f9cc9c8a1e5909273c1a162a63ea7a66afd8.tar.gz DotNetOpenAuth-bbe3f9cc9c8a1e5909273c1a162a63ea7a66afd8.tar.bz2 |
Switched out the Sandcastle binaries for the source code.
Diffstat (limited to 'tools/Sandcastle/Source/DBCSFix/Program.cs')
-rw-r--r-- | tools/Sandcastle/Source/DBCSFix/Program.cs | 208 |
1 files changed, 208 insertions, 0 deletions
// Reconstructed from the diff of tools/Sandcastle/Source/DBCSFix/Program.cs (new file, blob b463860).
// Copyright (c) Microsoft Corporation. All rights reserved.
//

using Microsoft.Ddue.Tools.CommandLine;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Text.RegularExpressions;
using System.Text;
using System.Xml;
using System.Xml.XPath;

namespace DBCSFix
{
    /// <summary>
    /// Command-line tool that rewrites the *.htm files of a CHM project so that they
    /// can be compiled for a given language: unsupported characters are replaced by
    /// ASCII equivalents or named entities, the charset declaration is rewritten, and
    /// the files are re-encoded from UTF-8 to the code page configured for the LCID.
    /// </summary>
    internal class Program
    {
        // LCID used when the "l" option is absent; Main does no code-page work for it.
        private const string DefaultLcid = "1033";

        // Japanese LCID; non-breaking spaces are kept (as an entity) for this language.
        private const string JapaneseLcid = "1041";

        // Charset assumed when the application configuration has no entry for an LCID.
        private const string DefaultCharset = "Windows-1252";

        // Search pattern for the files that are rewritten.
        private const string HtmlFileSpec = "*.htm";

        /// <summary>
        /// Program entry point. Parses the "?", "d" (directory) and "l" (LCID) options
        /// and runs the conversion steps over the chosen directory.
        /// </summary>
        /// <param name="args">The raw command-line arguments.</param>
        public static void Main(string[] args)
        {
            ConsoleApplication.WriteBanner();

            // get and validate args
            OptionCollection programOptions = new OptionCollection();
            programOptions.Add(new SwitchOption("?", "Show this help page."));
            programOptions.Add(new StringOption("d", @"The directory containing CHM input files (e.g., HHP file). For example, 'C:\DocProject\Output\Chm'. Default is the current directory."));
            programOptions.Add(new StringOption("l", @"The language code ID in decimal. For example, '1033'. Default is '1033' (for EN-US)."));
            ParseArgumentsResult options = programOptions.ParseArguments(args);
            if (options.Options["?"].IsPresent)
            {
                // Asking for help must not fall through and start rewriting files
                // in the current directory.
                programOptions.WriteOptionSummary(Console.Error);
                return;
            }

            // determine the working dir
            string chmDirectory;
            if (options.Options["d"].IsPresent)
            {
                chmDirectory = options.Options["d"].Value.ToString();
            }
            else
            {
                chmDirectory = Environment.CurrentDirectory;
            }

            // determine the desired language
            string lcid;
            if (options.Options["l"].IsPresent)
            {
                lcid = options.Options["l"].Value.ToString();
            }
            else
            {
                lcid = DefaultLcid;
            }

            // ensure working dir exists
            if (!Directory.Exists(chmDirectory))
            {
                Console.WriteLine("The specified directory '{0}' doesn't exist. Quitting.", chmDirectory);
                return;
            }

            // convert unsupported high-order chars to ascii equivalents
            substituteAsciiEquivalents(chmDirectory, lcid);

            // no further work required for 1033
            if (String.Equals(lcid, DefaultLcid))
            {
                return;
            }

            // convert unsupported chars to named entities
            substituteNamedEntities(chmDirectory);

            // convert charset declarations from utf8 to proper ansi codepage value
            substituteCodepages(chmDirectory, lcid);

            // convert char encodings from utf8 to ansi
            convertUtf8ToAnsi(chmDirectory, lcid);
        }

        /// <summary>
        /// Re-encodes every *.htm file under <paramref name="chmDirectory"/> (recursively)
        /// into the encoding configured for <paramref name="lcid"/>.
        /// </summary>
        /// <param name="chmDirectory">Root directory of the CHM input files.</param>
        /// <param name="lcid">Decimal language code ID used to look up the target encoding.</param>
        private static void convertUtf8ToAnsi(string chmDirectory, string lcid)
        {
            Console.WriteLine("Converting character encodings from utf8 to ansi.");
            Encoding ansi = Encoding.GetEncoding(encodingNameForLcid(lcid));

            string[] files = Directory.GetFiles(chmDirectory, HtmlFileSpec, SearchOption.AllDirectories);
            foreach (string file in files)
            {
                string tempFile = file + ".tmp";

                // The writer performs the conversion: each line is decoded by the reader
                // and encoded with the target code page on output, so no manual
                // byte-level round trip is needed.
                using (StreamWriter sw = new StreamWriter(tempFile, false, ansi))
                {
                    using (StreamReader input = new StreamReader(file))
                    {
                        string line;
                        while ((line = input.ReadLine()) != null)
                        {
                            sw.WriteLine(line);
                        }
                    }
                }

                File.Delete(file);
                File.Move(tempFile, file);
            }
        }

        /// <summary>
        /// Looks up the charset name for an LCID in the application settings, where the
        /// LCID is the key.
        /// </summary>
        /// <param name="lcid">Decimal language code ID.</param>
        /// <returns>The configured charset name, or "Windows-1252" when none is configured.</returns>
        private static string encodingNameForLcid(string lcid)
        {
            // NOTE(review): ConfigurationSettings is marked obsolete in favour of
            // ConfigurationManager, which needs a System.Configuration assembly
            // reference; left unchanged because the project file is not visible here.
            string charset = System.Configuration.ConfigurationSettings.AppSettings[lcid];
            if (String.IsNullOrEmpty(charset))
            {
                return DefaultCharset;
            }

            return charset;
        }

        /// <summary>
        /// Replaces typographic characters in every *.htm file with plain ASCII equivalents.
        /// </summary>
        /// <param name="chmDirectory">Root directory of the CHM input files.</param>
        /// <param name="lcid">Decimal language code ID; selects the non-breaking-space and dash handling.</param>
        private static void substituteAsciiEquivalents(string chmDirectory, string lcid)
        {
            Console.WriteLine("Converting unsupported high-order characters to 7-bit ASCII equivalents.");

            /* substitution table:
             * Char name                  utf8 (hex)      ascii
             * Non-breaking space         \xC2\xA0        " " (for all languages except Japanese)
             * Non-breaking hyphen        \xE2\x80\x91    "-"
             * En dash                    \xE2\x80\x93    "-"
             * Left curly single quote    \xE2\x80\x98    "'"
             * Right curly single quote   \xE2\x80\x99    "'"
             * Left curly double quote    \xE2\x80\x9C    "\""
             * Right curly double quote   \xE2\x80\x9D    "\""
             * Horizontal ellipsis        U+2026          "..."
             */

            Dictionary<Regex, string> substitutionPatterns = new Dictionary<Regex, string>();
            substitutionPatterns.Add(new Regex(@"\u2018|\u2019", RegexOptions.Compiled), "'");
            substitutionPatterns.Add(new Regex(@"\u201C|\u201D", RegexOptions.Compiled), "\"");
            substitutionPatterns.Add(new Regex(@"\u2026", RegexOptions.Compiled), "...");

            // The language is identified by the LCID, not by the directory path.
            if (!String.Equals(lcid, JapaneseLcid))
            {
                substitutionPatterns.Add(new Regex(@"\u00A0", RegexOptions.Compiled), " ");
            }
            else
            {
                // Japanese keeps the non-breaking space, expressed as a named entity.
                substitutionPatterns.Add(new Regex(@"\u00A0", RegexOptions.Compiled), "&nbsp;");
            }

            string ansi = Encoding.GetEncoding(encodingNameForLcid(lcid)).HeaderName;
            Console.WriteLine("EncodingName: " + ansi);
            if (String.Equals(ansi, DefaultCharset, StringComparison.OrdinalIgnoreCase))
            {
                // Windows-1252 target (e.g., 1033): Main stops after this step for
                // LCID 1033, so em dashes are flattened to hyphens here as well.
                substitutionPatterns.Add(new Regex(@"\u2011|\u2013|\u2014", RegexOptions.Compiled), "-");
            }
            else
            {
                // Other code pages: em dashes are left for substituteNamedEntities.
                substitutionPatterns.Add(new Regex(@"\u2011|\u2013", RegexOptions.Compiled), "-");
            }

            // Apply the table for every code page; previously the Windows-1252 branch
            // built its patterns but never ran them.
            substituteInFiles(chmDirectory, HtmlFileSpec, substitutionPatterns);
        }

        /// <summary>
        /// Rewrites "CHARSET=UTF-8" declarations (case-insensitively) in every *.htm file
        /// to name the charset configured for <paramref name="lcid"/>.
        /// </summary>
        /// <param name="chmDirectory">Root directory of the CHM input files.</param>
        /// <param name="lcid">Decimal language code ID used to look up the charset name.</param>
        private static void substituteCodepages(string chmDirectory, string lcid)
        {
            Console.WriteLine("Inserting charset declarations.");

            Dictionary<Regex, string> substitutionPatterns = new Dictionary<Regex, string>();
            substitutionPatterns.Add(new Regex(@"CHARSET=UTF-8", RegexOptions.Compiled | RegexOptions.IgnoreCase), "CHARSET=" + encodingNameForLcid(lcid));

            substituteInFiles(chmDirectory, HtmlFileSpec, substitutionPatterns);
        }

        /// <summary>
        /// Applies every pattern/replacement pair to each line of each matching file
        /// under <paramref name="directory"/> (recursively). Each file is rewritten as
        /// UTF-8 through a ".tmp" sibling that then replaces the original.
        /// </summary>
        /// <param name="directory">Root directory to search.</param>
        /// <param name="fileSpec">Search pattern for the files to rewrite, e.g. "*.htm".</param>
        /// <param name="substitutionPatterns">Regex/replacement pairs applied to every line.</param>
        private static void substituteInFiles(string directory, string fileSpec, ICollection<KeyValuePair<Regex, string>> substitutionPatterns)
        {
            Debug.Assert(Directory.Exists(directory), "Specified directory doesn't exist.");
            Debug.Assert(!String.IsNullOrEmpty(fileSpec), "FileSpec is empty");
            Debug.Assert(substitutionPatterns.Count > 0, "No substitution patterns.");

            string[] files = Directory.GetFiles(directory, fileSpec, SearchOption.AllDirectories);
            foreach (string file in files)
            {
                string tempFile = file + ".tmp";

                // append: false, so a stale temp file left by an aborted run is
                // overwritten instead of being prepended to the new content.
                using (StreamWriter output = new StreamWriter(tempFile, false, Encoding.UTF8))
                {
                    using (StreamReader input = new StreamReader(file))
                    {
                        string line;
                        while ((line = input.ReadLine()) != null)
                        {
                            foreach (KeyValuePair<Regex, string> pattern in substitutionPatterns)
                            {
                                line = pattern.Key.Replace(line, pattern.Value);
                            }

                            output.WriteLine(line);
                        }
                    }
                }

                File.Delete(file);
                File.Move(tempFile, file);
            }
        }

        /// <summary>
        /// Replaces symbol characters in every *.htm file with HTML named entities.
        /// </summary>
        /// <param name="chmDirectory">Root directory of the CHM input files.</param>
        private static void substituteNamedEntities(string chmDirectory)
        {
            Console.WriteLine("Converting other unsupported high-order characters to named entities.");

            /* substitution table:
             * Char name               utf8 (hex)      named entity
             * Copyright               \xC2\xA9        &copy;
             * Registered trademark    \xC2\xAE        &reg;
             * Em dash                 \xE2\x80\x94    &mdash;
             * Trademark               \xE2\x84\xA2    &trade;
             */

            Dictionary<Regex, string> substitutionPatterns = new Dictionary<Regex, string>();
            substitutionPatterns.Add(new Regex(@"\u00A9", RegexOptions.Compiled), "&copy;");
            substitutionPatterns.Add(new Regex(@"\u00AE", RegexOptions.Compiled), "&reg;");
            substitutionPatterns.Add(new Regex(@"\u2014", RegexOptions.Compiled), "&mdash;");
            substitutionPatterns.Add(new Regex(@"\u2122", RegexOptions.Compiled), "&trade;");

            substituteInFiles(chmDirectory, HtmlFileSpec, substitutionPatterns);
        }
    }
}