summaryrefslogtreecommitdiffstats
path: root/tools/Sandcastle/Source/DBCSFix/Program.cs
diff options
context:
space:
mode:
authorAndrew Arnott <andrewarnott@gmail.com>2009-09-20 21:18:59 -0700
committerAndrew Arnott <andrewarnott@gmail.com>2009-09-21 08:06:22 -0700
commitbbe3f9cc9c8a1e5909273c1a162a63ea7a66afd8 (patch)
treec91f66e642c4d26fca266e226b3f2765f546d700 /tools/Sandcastle/Source/DBCSFix/Program.cs
parent627014f0bbc3fd576277375e70f8391d150b0a67 (diff)
downloadDotNetOpenAuth-bbe3f9cc9c8a1e5909273c1a162a63ea7a66afd8.zip
DotNetOpenAuth-bbe3f9cc9c8a1e5909273c1a162a63ea7a66afd8.tar.gz
DotNetOpenAuth-bbe3f9cc9c8a1e5909273c1a162a63ea7a66afd8.tar.bz2
Switched out the Sandcastle binaries for the source code.
Diffstat (limited to 'tools/Sandcastle/Source/DBCSFix/Program.cs')
-rw-r--r--tools/Sandcastle/Source/DBCSFix/Program.cs208
1 files changed, 208 insertions, 0 deletions
diff --git a/tools/Sandcastle/Source/DBCSFix/Program.cs b/tools/Sandcastle/Source/DBCSFix/Program.cs
new file mode 100644
index 0000000..b463860
--- /dev/null
+++ b/tools/Sandcastle/Source/DBCSFix/Program.cs
@@ -0,0 +1,208 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+
+using Microsoft.Ddue.Tools.CommandLine;
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.IO;
+using System.Text.RegularExpressions;
+using System.Text;
+using System.Xml;
+using System.Xml.XPath;
+
+namespace DBCSFix
+{
+ internal class Program
+ {
+        /// <summary>
+        /// Program entry point. Parses the '?', 'd' and 'l' options, then rewrites the
+        /// *.htm files below the chosen directory for the chosen language code ID.
+        /// </summary>
+        /// <param name="args">The raw command-line arguments.</param>
+        public static void Main(string[] args)
+        {
+            ConsoleApplication.WriteBanner();
+
+            // get and validate args
+            OptionCollection programOptions = new OptionCollection();
+            programOptions.Add(new SwitchOption("?", "Show this help page."));
+            programOptions.Add(new StringOption("d", @"The directory containing CHM input files (e.g., HHP file). For example, 'C:\DocProject\Output\Chm'. Default is the current directory."));
+            programOptions.Add(new StringOption("l", @"The language code ID in decimal. For example, '1033'. Default is '1033' (for EN-US)."));
+            ParseArgumentsResult options = programOptions.ParseArguments(args);
+            if (options.Options["?"].IsPresent)
+            {
+                // Help was requested: print the option summary and stop. Without this
+                // return the tool would carry on and modify files in the working directory.
+                programOptions.WriteOptionSummary(Console.Error);
+                return;
+            }
+
+            // determine the working dir (defaults to the current directory)
+            string chmDirectory = options.Options["d"].IsPresent
+                ? options.Options["d"].Value.ToString()
+                : Environment.CurrentDirectory;
+
+            // determine the desired language (defaults to 1033)
+            string lcid = options.Options["l"].IsPresent
+                ? options.Options["l"].Value.ToString()
+                : "1033";
+
+            // ensure working dir exists
+            if (!Directory.Exists(chmDirectory))
+            {
+                Console.WriteLine("The specified directory '{0}' doesn't exist. Quitting.", chmDirectory);
+                return;
+            }
+
+            // convert unsupported high-order chars to ascii equivalents
+            substituteAsciiEquivalents(chmDirectory, lcid);
+
+            // no further work required for 1033
+            if (String.Equals(lcid, "1033"))
+                return;
+
+            // convert unsupported chars to named entities
+            substituteNamedEntities(chmDirectory);
+
+            // convert charset declarations from utf8 to proper ansi codepage value
+            substituteCodepages(chmDirectory, lcid);
+
+            // convert char encodings from utf8 to ansi
+            convertUtf8ToAnsi(chmDirectory, lcid);
+        }
+
+        /// <summary>
+        /// Re-encodes every *.htm file below <paramref name="chmDirectory"/> (searched
+        /// recursively) using the encoding that encodingNameForLcid returns for
+        /// <paramref name="lcid"/>. Each file is written to a ".tmp" sibling first and
+        /// then swapped in for the original.
+        /// </summary>
+        /// <param name="chmDirectory">Root directory to search for *.htm files.</param>
+        /// <param name="lcid">Language code ID used to look up the target encoding name.</param>
+        private static void convertUtf8ToAnsi(string chmDirectory, string lcid)
+        {
+            Console.WriteLine("Converting character encodings from utf8 to ansi.");
+            Encoding ansi = Encoding.GetEncoding(encodingNameForLcid(lcid));
+
+            string[] htmFiles = Directory.GetFiles(chmDirectory, "*.htm", SearchOption.AllDirectories);
+            for (int index = 0; index < htmFiles.Length; index++)
+            {
+                string originalPath = htmFiles[index];
+                string temporaryPath = originalPath + ".tmp";
+
+                using (StreamWriter sw = new StreamWriter(temporaryPath, false, ansi))
+                using (StreamReader input = new StreamReader(originalPath))
+                {
+                    // the reader's encoding is captured once, before the first read
+                    Encoding sourceEncoding = input.CurrentEncoding;
+
+                    for (string line = input.ReadLine(); line != null; line = input.ReadLine())
+                    {
+                        // round-trip the line through the target encoding before writing it
+                        byte[] converted = Encoding.Convert(sourceEncoding, ansi, sourceEncoding.GetBytes(line));
+                        sw.WriteLine(ansi.GetString(converted));
+                    }
+                }
+
+                // both streams are closed here, so the original can be replaced
+                File.Delete(originalPath);
+                File.Move(temporaryPath, originalPath);
+            }
+        }
+
+        /// <summary>
+        /// Looks up the encoding name stored in the application settings under the key
+        /// <paramref name="lcid"/>.
+        /// </summary>
+        /// <param name="lcid">Language code ID, used verbatim as the settings key.</param>
+        /// <returns>The configured value, or "Windows-1252" when the setting is missing or empty.</returns>
+        private static string encodingNameForLcid(string lcid)
+        {
+            string configured = System.Configuration.ConfigurationSettings.AppSettings[lcid];
+            return String.IsNullOrEmpty(configured) ? "Windows-1252" : configured;
+        }
+
+        /// <summary>
+        /// Replaces typographic characters (curly quotes, ellipsis, non-breaking space,
+        /// non-breaking hyphen, en dash and, for Windows-1252, em dash) with plain text
+        /// in every *.htm file below <paramref name="chmDirectory"/>.
+        /// </summary>
+        /// <param name="chmDirectory">Root directory to search for *.htm files.</param>
+        /// <param name="lcid">Language code ID; selects the non-breaking-space replacement and the target encoding.</param>
+        private static void substituteAsciiEquivalents(string chmDirectory, string lcid)
+        {
+            Console.WriteLine("Converting unsupported high-order characters to 7-bit ASCII equivalents.");
+
+            /* substitution table:
+             * Char name                  utf8 (hex)      ascii
+             * Non-breaking space         \xC2\xA0        "&nbsp;" (for all languages except Japanese, which gets " ")
+             * Non-breaking hyphen        \xE2\x80\x91    "-"
+             * En dash                    \xE2\x80\x93    "-"
+             * Em dash                    \xE2\x80\x94    "-" (Windows-1252 only)
+             * Left curly single quote    \xE2\x80\x98    "'"
+             * Right curly single quote   \xE2\x80\x99    "'"
+             * Left curly double quote    \xE2\x80\x9C    "\""
+             * Right curly double quote   \xE2\x80\x9D    "\""
+             * Horizontal ellipsis        U+2026          "..."
+             */
+
+            Dictionary<Regex, string> substitutionPatterns = new Dictionary<Regex, string>();
+            substitutionPatterns.Add(new Regex(@"\u2018|\u2019", RegexOptions.Compiled), "'");
+            substitutionPatterns.Add(new Regex(@"\u201C|\u201D", RegexOptions.Compiled), "\"");
+            substitutionPatterns.Add(new Regex(@"\u2026", RegexOptions.Compiled), "...");
+
+            // 1041 is the Japanese language ID. The test has to be made against the
+            // language ID; comparing the directory path to "1041" never selects Japanese.
+            if (lcid != "1041")
+                substitutionPatterns.Add(new Regex(@"\u00A0", RegexOptions.Compiled), "&nbsp;");
+            else
+                substitutionPatterns.Add(new Regex(@"\u00A0", RegexOptions.Compiled), " ");
+
+            string ansi = Encoding.GetEncoding(encodingNameForLcid(lcid)).HeaderName;
+            Console.WriteLine("EncodingName: " + ansi);
+
+            // compare case-insensitively so a differently-cased header name still matches
+            if (!string.Equals(ansi, "Windows-1252", StringComparison.OrdinalIgnoreCase))
+            {
+                // em dashes are left alone here; for languages other than 1033 Main later
+                // runs substituteNamedEntities, which maps \u2014 to &mdash;
+                substitutionPatterns.Add(new Regex(@"\u2011|\u2013", RegexOptions.Compiled), "-");
+            }
+            else
+            {
+                // Windows-1252 (e.g., 1033): replace em dashes with hyphens as well
+                substitutionPatterns.Add(new Regex(@"\u2011|\u2013|\u2014", RegexOptions.Compiled), "-");
+            }
+
+            // apply the table in both cases; previously the Windows-1252 branch built
+            // its patterns but never ran them
+            substituteInFiles(chmDirectory, "*.htm", substitutionPatterns);
+        }
+
+        /// <summary>
+        /// Rewrites "CHARSET=UTF-8" (matched case-insensitively) in every *.htm file below
+        /// <paramref name="chmDirectory"/> to name the encoding configured for
+        /// <paramref name="lcid"/>.
+        /// </summary>
+        /// <param name="chmDirectory">Root directory to search for *.htm files.</param>
+        /// <param name="lcid">Language code ID used to look up the replacement encoding name.</param>
+        private static void substituteCodepages(string chmDirectory, string lcid)
+        {
+            Console.WriteLine("Inserting charset declarations.");
+
+            Regex utf8Declaration = new Regex(@"CHARSET=UTF-8", RegexOptions.Compiled | RegexOptions.IgnoreCase);
+            string replacement = "CHARSET=" + encodingNameForLcid(lcid);
+
+            Dictionary<Regex, string> charsetRules = new Dictionary<Regex, string>();
+            charsetRules[utf8Declaration] = replacement;
+
+            substituteInFiles(chmDirectory, "*.htm", charsetRules);
+        }
+
+        /// <summary>
+        /// Applies every regex/replacement pair to each line of each file matching
+        /// <paramref name="fileSpec"/> below <paramref name="directory"/> (searched
+        /// recursively). Each file is rewritten as UTF-8 through a ".tmp" sibling that
+        /// then replaces the original.
+        /// </summary>
+        /// <param name="directory">Root directory to search.</param>
+        /// <param name="fileSpec">Search pattern passed to Directory.GetFiles, e.g. "*.htm".</param>
+        /// <param name="substitutionPatterns">Pairs of pattern and replacement text, applied in enumeration order.</param>
+        private static void substituteInFiles(string directory, string fileSpec, ICollection<KeyValuePair<Regex, string>> substitutionPatterns)
+        {
+            Debug.Assert(Directory.Exists(directory), "Specified directory doesn't exist.");
+            Debug.Assert(!String.IsNullOrEmpty(fileSpec), "FileSpec is empty");
+            Debug.Assert(substitutionPatterns.Count > 0, "No substitution patterns.");
+
+            string[] files = Directory.GetFiles(directory, fileSpec, SearchOption.AllDirectories);
+            foreach (string file in files)
+            {
+                string tempFile = file + ".tmp";
+
+                // Open the temp file in overwrite mode (append: false). In append mode a
+                // stale .tmp left by an interrupted run would end up prepended to the output.
+                using (StreamWriter output = new StreamWriter(tempFile, false, Encoding.UTF8))
+                {
+                    using (StreamReader input = new StreamReader(file))
+                    {
+                        string line;
+                        while ((line = input.ReadLine()) != null)
+                        {
+                            foreach (KeyValuePair<Regex, string> pattern in substitutionPatterns)
+                            {
+                                line = pattern.Key.Replace(line, pattern.Value);
+                            }
+                            output.WriteLine(line);
+                        }
+                    }
+                }
+
+                // both streams are closed here, so the original can be replaced
+                File.Delete(file);
+                File.Move(tempFile, file);
+            }
+        }
+
+        /// <summary>
+        /// Replaces the copyright, registered-trademark, em-dash and trademark characters
+        /// with HTML named entities in every *.htm file below <paramref name="chmDirectory"/>.
+        /// </summary>
+        /// <param name="chmDirectory">Root directory to search for *.htm files.</param>
+        private static void substituteNamedEntities(string chmDirectory)
+        {
+            Console.WriteLine("Converting other unsupported high-order characters to named entities.");
+
+            /* substitution table:
+             * Char name               utf8 (hex)      named entity
+             * Copyright               \xC2\xA9        &copy;
+             * Registered trademark    \xC2\xAE        &reg;
+             * Em dash                 \xE2\x80\x94    &mdash;
+             * Trademark               \xE2\x84\xA2    &trade;
+             */
+
+            Dictionary<Regex, string> entityRules = new Dictionary<Regex, string>();
+            entityRules[new Regex(@"\u00A9", RegexOptions.Compiled)] = "&copy;";
+            entityRules[new Regex(@"\u00AE", RegexOptions.Compiled)] = "&reg;";
+            entityRules[new Regex(@"\u2014", RegexOptions.Compiled)] = "&mdash;";
+            entityRules[new Regex(@"\u2122", RegexOptions.Compiled)] = "&trade;";
+
+            substituteInFiles(chmDirectory, "*.htm", entityRules);
+        }
+ }
+}