1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
|
// Copyright © Microsoft Corporation.
// This source file is subject to the Microsoft Permissive License.
// See http://www.microsoft.com/resources/sharedsource/licensingbasics/sharedsourcelicenses.mspx.
// All other rights reserved.
using Microsoft.Ddue.Tools.CommandLine;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Text.RegularExpressions;
using System.Text;
using System.Xml;
using System.Xml.XPath;
namespace DBCSFix
{
internal class Program
{
/// <summary>
/// Program entry point. Parses the command line, resolves the CHM working directory and
/// the language code, and then runs the substitution/conversion passes over the files in
/// that directory.
/// </summary>
/// <param name="args">
/// Command-line arguments. Recognized options are "?" (show help), "d" (directory that
/// contains the CHM input files; defaults to the current directory) and "l" (decimal
/// language code ID; defaults to "1033").
/// </param>
public static void Main(string[] args)
{
    ConsoleApplication.WriteBanner();

    // get and validate args
    OptionCollection programOptions = new OptionCollection();
    programOptions.Add(new SwitchOption("?", "Show this help page."));
    programOptions.Add(new StringOption("d", @"The directory containing CHM input files (e.g., HHP file). For example, 'C:\DocProject\Output\Chm'. Default is the current directory."));
    programOptions.Add(new StringOption("l", @"The language code ID in decimal. For example, '1033'. Default is '1033' (for EN-US)."));
    ParseArgumentsResult options = programOptions.ParseArguments(args);

    if (options.Options["?"].IsPresent)
    {
        // Help was requested: print the option summary and stop. Without this return the
        // program would go on to rewrite files in the working directory even though the
        // user only asked for usage information.
        programOptions.WriteOptionSummary(Console.Error);
        return;
    }

    // determine the working dir
    string chmDirectory;
    if (options.Options["d"].IsPresent)
        chmDirectory = options.Options["d"].Value.ToString();
    else
        chmDirectory = Environment.CurrentDirectory;

    // determine the desired language
    string lcid;
    if (options.Options["l"].IsPresent)
        lcid = options.Options["l"].Value.ToString();
    else
        lcid = "1033";

    // ensure working dir exists
    if (!Directory.Exists(chmDirectory))
    {
        Console.WriteLine("The specified directory '{0}' doesn't exist. Quitting.", chmDirectory);
        return;
    }

    // convert unsupported high-order chars to ascii equivalents
    substituteAsciiEquivalents(chmDirectory, lcid);

    // no further work required for 1033
    if (String.Equals(lcid, "1033"))
        return;

    // convert unsupported chars to named entities
    substituteNamedEntities(chmDirectory);

    // convert charset declarations from utf8 to proper ansi codepage value
    substituteCodepages(chmDirectory, lcid);

    // convert char encodings from utf8 to ansi
    convertUtf8ToAnsi(chmDirectory, lcid);
}
/// <summary>
/// Rewrites every "*.htm" file found under <paramref name="chmDirectory"/> (searching
/// all subdirectories) using the encoding returned by encodingNameForLcid for
/// <paramref name="lcid"/>.
/// </summary>
/// <param name="chmDirectory">Root directory that is searched for "*.htm" files.</param>
/// <param name="lcid">Language code ID used to select the target encoding.</param>
private static void convertUtf8ToAnsi(string chmDirectory, string lcid)
{
    Console.WriteLine("Converting character encodings from utf8 to ansi.");

    Encoding targetEncoding = Encoding.GetEncoding(encodingNameForLcid(lcid));
    string[] htmlFiles = Directory.GetFiles(chmDirectory, "*.htm", SearchOption.AllDirectories);

    for (int index = 0; index < htmlFiles.Length; index++)
    {
        string path = htmlFiles[index];
        string tempPath = path + ".tmp";

        // The converted text goes to a sibling ".tmp" file which replaces the original
        // once both streams have been closed.
        using (StreamWriter writer = new StreamWriter(tempPath, false, targetEncoding))
        using (StreamReader reader = new StreamReader(path))
        {
            // Captured before the first read, exactly once per file.
            Encoding sourceEncoding = reader.CurrentEncoding;

            for (string text = reader.ReadLine(); text != null; text = reader.ReadLine())
            {
                byte[] encodedSource = sourceEncoding.GetBytes(text);
                byte[] encodedTarget = Encoding.Convert(sourceEncoding, targetEncoding, encodedSource);
                writer.WriteLine(targetEncoding.GetString(encodedTarget));
            }
        }

        File.Delete(path);
        File.Move(tempPath, path);
    }
}
/// <summary>
/// Returns the encoding name configured for a language code ID.
/// </summary>
/// <param name="lcid">Language code ID; used as the key into the application settings.</param>
/// <returns>
/// The application setting stored under <paramref name="lcid"/>, or "Windows-1252" when
/// that setting is missing or empty.
/// </returns>
private static string encodingNameForLcid(string lcid)
{
    string configuredName = System.Configuration.ConfigurationSettings.AppSettings[lcid];
    return String.IsNullOrEmpty(configuredName) ? "Windows-1252" : configuredName;
}
/// <summary>
/// Replaces typographic characters with plain equivalents in every "*.htm" file under
/// <paramref name="chmDirectory"/>. Which dashes are replaced depends on the encoding
/// selected for <paramref name="lcid"/>.
/// </summary>
/// <param name="chmDirectory">Root directory whose "*.htm" files are rewritten.</param>
/// <param name="lcid">Language code ID that selects the target encoding and the
/// handling of non-breaking spaces.</param>
private static void substituteAsciiEquivalents(string chmDirectory, string lcid)
{
    Console.WriteLine("Converting unsupported high-order characters to 7-bit ASCII equivalents.");

    /* substitution table:
     * Char name                  utf8 (hex)      replacement
     * Non-breaking space         \xC2\xA0        " " (all languages except Japanese, which gets "&nbsp;")
     * Non-breaking hyphen        \xE2\x80\x91    "-"
     * En dash                    \xE2\x80\x93    "-"
     * Em dash                    \xE2\x80\x94    "-" (only when the target is not Windows-1252)
     * Left curly single quote    \xE2\x80\x98    "'"
     * Right curly single quote   \xE2\x80\x99    "'"
     * Left curly double quote    \xE2\x80\x9C    "\""
     * Right curly double quote   \xE2\x80\x9D    "\""
     * Horizontal ellipsis        \xE2\x80\xA6    "..."
     */
    Dictionary<Regex, string> substitutionPatterns = new Dictionary<Regex, string>();
    substitutionPatterns.Add(new Regex(@"\u2018|\u2019", RegexOptions.Compiled), "'");
    substitutionPatterns.Add(new Regex(@"\u201C|\u201D", RegexOptions.Compiled), "\"");
    substitutionPatterns.Add(new Regex(@"\u2026", RegexOptions.Compiled), "...");

    // The language test must look at the LCID, not at the directory path. Japanese
    // (1041) is the documented exception to the plain-space rule.
    // NOTE(review): the previous code used the same replacement in both branches; the
    // named entity is used for Japanese because it keeps the non-breaking space intact
    // in HTML regardless of the target code page - confirm this is the desired output.
    if (String.Equals(lcid, "1041"))
        substitutionPatterns.Add(new Regex(@"\u00A0", RegexOptions.Compiled), "&nbsp;");
    else
        substitutionPatterns.Add(new Regex(@"\u00A0", RegexOptions.Compiled), " ");

    string ansi = Encoding.GetEncoding(encodingNameForLcid(lcid)).HeaderName;
    Console.WriteLine("EncodingName: " + ansi);

    // Encoding names are not case-sensitive, so compare without regard to case.
    if (String.Equals(ansi, "Windows-1252", StringComparison.OrdinalIgnoreCase))
    {
        // Windows-1252 can represent the em dash, so only the other dashes are replaced.
        substitutionPatterns.Add(new Regex(@"\u2011|\u2013", RegexOptions.Compiled), "-");
    }
    else
    {
        // replace em-dashes with hyphens too, if not windows-1252
        substitutionPatterns.Add(new Regex(@"\u2011|\u2013|\u2014", RegexOptions.Compiled), "-");
    }

    // Apply the substitutions for every language; previously this ran in only one
    // branch, so the patterns built in the other branch were never used.
    substituteInFiles(chmDirectory, "*.htm", substitutionPatterns);
}
/// <summary>
/// Rewrites "CHARSET=UTF-8" declarations (matched case-insensitively) in every "*.htm"
/// file under <paramref name="chmDirectory"/> so that they name the encoding returned by
/// encodingNameForLcid for <paramref name="lcid"/>.
/// </summary>
/// <param name="chmDirectory">Root directory whose "*.htm" files are rewritten.</param>
/// <param name="lcid">Language code ID used to look up the replacement charset name.</param>
private static void substituteCodepages(string chmDirectory, string lcid)
{
    Console.WriteLine("Inserting charset declarations.");

    Regex utf8Declaration = new Regex(@"CHARSET=UTF-8", RegexOptions.Compiled | RegexOptions.IgnoreCase);
    string targetDeclaration = "CHARSET=" + encodingNameForLcid(lcid);

    Dictionary<Regex, string> charsetPatterns = new Dictionary<Regex, string>();
    charsetPatterns[utf8Declaration] = targetDeclaration;

    substituteInFiles(chmDirectory, "*.htm", charsetPatterns);
}
/// <summary>
/// Applies every regex substitution in <paramref name="substitutionPatterns"/> to each
/// line of each file matching <paramref name="fileSpec"/> under
/// <paramref name="directory"/> (all subdirectories included). Each file is rewritten as
/// UTF-8 through a sibling ".tmp" file that then replaces the original.
/// </summary>
/// <param name="directory">Root directory to search.</param>
/// <param name="fileSpec">Search pattern passed to Directory.GetFiles, e.g. "*.htm".</param>
/// <param name="substitutionPatterns">Pairs of pattern and replacement text; every pair
/// is applied to every line.</param>
private static void substituteInFiles(string directory, string fileSpec, ICollection<KeyValuePair<Regex, string>> substitutionPatterns)
{
    Debug.Assert(Directory.Exists(directory), "Specified directory doesn't exist.");
    Debug.Assert(!String.IsNullOrEmpty(fileSpec), "FileSpec is empty");
    Debug.Assert(substitutionPatterns.Count > 0, "No substitution patterns.");

    string[] files = Directory.GetFiles(directory, fileSpec, SearchOption.AllDirectories);
    foreach (string file in files)
    {
        string tempFile = file + ".tmp";

        // Open the temp file for overwrite, not append: a ".tmp" file left behind by an
        // interrupted earlier run must not end up prepended to the rewritten content.
        using (StreamWriter output = new StreamWriter(tempFile, false, Encoding.UTF8))
        using (StreamReader input = new StreamReader(file))
        {
            string line;
            while ((line = input.ReadLine()) != null)
            {
                foreach (KeyValuePair<Regex, string> pattern in substitutionPatterns)
                {
                    line = pattern.Key.Replace(line, pattern.Value);
                }
                output.WriteLine(line);
            }
        }

        // Swap the rewritten copy into place only after both streams are closed.
        File.Delete(file);
        File.Move(tempFile, file);
    }
}
/// <summary>
/// Replaces the copyright, registered-trademark, em-dash and trademark characters with
/// their HTML named entities in every "*.htm" file under
/// <paramref name="chmDirectory"/>.
/// </summary>
/// <param name="chmDirectory">Root directory whose "*.htm" files are rewritten.</param>
private static void substituteNamedEntities(string chmDirectory)
{
    Console.WriteLine("Converting other unsupported high-order characters to named entities.");

    /* substitution table:
     * Char name                  utf8 (hex)      named entity
     * Copyright                  \xC2\xA9        &copy;
     * Registered trademark       \xC2\xAE        &reg;
     * Em dash                    \xE2\x80\x94    &mdash;
     * Trademark                  \xE2\x84\xA2    &trade;
     */
    // The replacement text must be the entity reference itself. Replacing a character
    // with the same literal character would leave the files unchanged.
    Dictionary<Regex, string> substitutionPatterns = new Dictionary<Regex, string>();
    substitutionPatterns.Add(new Regex(@"\u00A9", RegexOptions.Compiled), "&copy;");
    substitutionPatterns.Add(new Regex(@"\u00AE", RegexOptions.Compiled), "&reg;");
    substitutionPatterns.Add(new Regex(@"\u2014", RegexOptions.Compiled), "&mdash;");
    substitutionPatterns.Add(new Regex(@"\u2122", RegexOptions.Compiled), "&trade;");

    substituteInFiles(chmDirectory, "*.htm", substitutionPatterns);
}
}
}
|