diff options
Diffstat (limited to 'unicode_utils/unicode.ml')
-rw-r--r-- | unicode_utils/unicode.ml | 815 |
1 files changed, 815 insertions, 0 deletions
diff --git a/unicode_utils/unicode.ml b/unicode_utils/unicode.ml new file mode 100644 index 0000000..38bfb70 --- /dev/null +++ b/unicode_utils/unicode.ml @@ -0,0 +1,815 @@ + +open Unicode_tables + +exception Invalid + +let fail () = raise Invalid + +let get s i = Char.code (String.unsafe_get s i) +let set s i v = String.unsafe_set s i (Char.unsafe_chr v) + +(****) + +let hangul_sbase = 0xAC00 +let hangul_lbase = 0x1100 +let hangul_vbase = 0x1161 +let hangul_tbase = 0x11A7 + +let hangul_scount = 11172 +let hangul_lcount = 19 +let hangul_vcount = 21 +let hangul_tcount = 28 +let hangul_ncount = hangul_vcount * hangul_tcount + +let set_char_3 s i c = + set s i (c lsr 12 + 0xE0); + set s (i + 1) ((c lsr 6) land 0x3f + 0x80); + set s (i + 2) (c land 0x3f + 0x80) + +let rec norm s i l s' j = + if i < l then begin + let c = get s i in + if c < 0x80 then begin + set s' j (get ascii_lower c); + norm s (i + 1) l s' (j + 1) + end else if c < 0xE0 then begin + (* 80 - 7FF *) + if c < 0xc2 || i + 1 >= l then raise Invalid; + let c1 = get s (i + 1) in + if c1 land 0xc0 <> 0x80 then raise Invalid; + let idx = get norm_prim (c - 0xc0) in + let idx = idx lsl 6 + c1 - 0x80 in + let k = get norm_second_high idx in + if k = 0 then begin + set s' j c; + set s' (j + 1) c1; + norm s (i + 2) l s' (j + 2) + end else begin + let k = (k - 2) lsl 8 + get norm_second_low idx in + let n = get norm_repl k in + String.blit norm_repl (k + 1) s' j n; + norm s (i + 2) l s' (j + n) + end + end else if c < 0xF0 then begin + (* 800 - FFFF *) + if i + 2 >= l then raise Invalid; + let c1 = get s (i + 1) in + if c1 land 0xc0 <> 0x80 then raise Invalid; + let idx = c lsl 6 + c1 - 0x3880 in + if idx < 0x20 then raise Invalid; + let c2 = get s (i + 2) in + if c2 land 0xc0 <> 0x80 then raise Invalid; + let idx = get norm_prim idx in + let idx = idx lsl 6 + c2 - 0x80 in + let k = get norm_second_high idx in + if k = 0 then begin + set s' j c; + set s' (j + 1) c1; + set s' (j + 2) c2; + norm s (i + 3) l s' (j + 3) + end else if k = 1 then begin + let v = c lsl 12 + c1 lsl 6 + c2 - (0x000E2080 + hangul_sbase) in + if v >= hangul_scount then begin + set s' j c; + set s' (j + 1) c1; + set s' (j + 2) c2; + norm s (i + 3) l s' (j + 3) + end else begin + set_char_3 s' j (v / hangul_ncount + hangul_lbase); + set_char_3 s' (j + 3) + ((v mod hangul_ncount) / hangul_tcount + hangul_vbase); + if v mod hangul_tcount = 0 then + norm s (i + 3) l s' (j + 6) + else begin + set_char_3 s' (j + 6) ((v mod hangul_tcount) + hangul_tbase); + norm s (i + 3) l s' (j + 9) + end + end + end else begin + let k = (k - 2) lsl 8 + get norm_second_low idx in + let n = get norm_repl k in + String.blit norm_repl (k + 1) s' j n; + norm s (i + 3) l s' (j + n) + end + end else begin + (* 10000 - 10FFFF *) + if i + 3 >= l then raise Invalid; + let c1 = get s (i + 1) in + let c2 = get s (i + 2) in + let c3 = get s (i + 3) in + if (c1 lor c2 lor c3) land 0xc0 <> 0x80 then raise Invalid; + let v = c lsl 18 + c1 lsl 12 + c2 lsl 6 + c3 - 0x03c82080 in + if v < 0x10000 || v > 0x10ffff then raise Invalid; + set s' j c; + set s' (j + 1) c1; + set s' (j + 2) c2; + set s' (j + 3) c3; + norm s (i + 4) l s' (j + 4) + end + end else + String.sub s' 0 j + +let normalize s = + let l = String.length s in + let s' = String.create (3 * l) in + try norm s 0 l s' 0 with Invalid -> s + +(****) + +let rec compare_rec s s' i l = + if i = l then begin + if l < String.length s then 1 else + if l < String.length s' then -1 else + 0 + end else begin + let c = get s i in + let c' = get s' i in + if c < 0x80 && c' < 0x80 then begin + let v = compare (get ascii_lower c) (get ascii_lower c') in + if v <> 0 then v else compare_rec s s' (i + 1) l + end else + compare (normalize s) (normalize s') + end + +let compare s s' = + compare_rec s s' 0 (min (String.length s) (String.length s')) + +(****) + +let rec decode_char s i l = + if i = l then fail () else + let c = get s i in + if c < 0x80 then + cont s (i + 1) l c + else if c < 0xE0 then begin + (* 80 - 7FF *) + if c < 0xc2 || i + 1 >= l then fail () else + let c1 = get s (i + 1) in + if c1 land 0xc0 <> 0x80 then fail () else + let v = c lsl 6 + c1 - 0x3080 in + cont s (i + 2) l v + end else if c < 0xF0 then begin + (* 800 - FFFF *) + if i + 2 >= l then fail () else + let c1 = get s (i + 1) in + let c2 = get s (i + 2) in + if (c1 lor c2) land 0xc0 <> 0x80 then fail () else + let v = c lsl 12 + c1 lsl 6 + c2 - 0xe2080 in + if v < 0x800 then fail () else + cont s (i + 3) l v + end else begin + (* 10000 - 10FFFF *) + if i + 3 >= l then fail () else + let c1 = get s (i + 1) in + let c2 = get s (i + 2) in + let c3 = get s (i + 3) in + if (c1 lor c2 lor c3) land 0xc0 <> 0x80 then fail () else + let v = c lsl 18 + c1 lsl 12 + c2 lsl 6 + c3 - 0x03c82080 in + if v < 0x10000 || v > 0x10ffff then fail () else + cont s (i + 4) l v + end + +and cont s i l v = (v, i) + +let encode_char s i l c = + if c < 0x80 then begin + if i >= l then fail () else begin + set s i c; + i + 1 + end + end else if c < 0x800 then begin + if i + 1 >= l then fail () else begin + set s i (c lsr 6 + 0xC0); + set s (i + 1) (c land 0x3f + 0x80); + i + 2 + end + end else if c < 0x10000 then begin + if i + 1 >= l then fail () else begin + set s i (c lsr 12 + 0xE0); + set s (i + 1) ((c lsr 6) land 0x3f + 0x80); + set s (i + 2) (c land 0x3f + 0x80); + i + 3 + end + end else begin + if i + 1 >= l then fail () else begin + set s i (c lsr 18 + 0xF0); + set s (i + 1) ((c lsr 12) land 0x3f + 0x80); + set s (i + 2) ((c lsr 6) land 0x3f + 0x80); + set s (i + 3) (c land 0x3f + 0x80); + i + 4 + end + end + +let rec prev_char s i = + let i = i - 1 in + if i < 0 then fail () else + if (get s i) land 0xc0 <> 0x80 then i else prev_char s i + +(****) + +let uniCharPrecompSourceTable = [| + 0x00000300; 0x00540000; 0x00000301; 0x00750054; + 0x00000302; 0x002000C9; 0x00000303; 0x001C00E9; + 0x00000304; 0x002C0105; 0x00000306; 0x00200131; + 0x00000307; 0x002E0151; 0x00000308; 0x0036017F; + 0x00000309; 0x001801B5; 0x0000030A; 0x000601CD; + 0x0000030B; 0x000601D3; 0x0000030C; 0x002501D9; + 0x0000030F; 0x000E01FE; 0x00000311; 0x000C020C; + 0x00000313; 0x000E0218; 0x00000314; 0x00100226; + 0x0000031B; 0x00040236; 0x00000323; 0x002A023A; + 0x00000324; 0x00020264; 0x00000325; 0x00020266; + 0x00000326; 0x00040268; 0x00000327; 0x0016026C; + 0x00000328; 0x000A0282; 0x0000032D; 0x000C028C; + 0x0000032E; 0x00020298; 0x00000330; 0x0006029A; + 0x00000331; 0x001102A0; 0x00000338; 0x002C02B1; + 0x00000342; 0x001D02DD; 0x00000345; 0x003F02FA; + 0x00000653; 0x00010339; 0x00000654; 0x0006033A; + 0x00000655; 0x00010340; 0x0000093C; 0x00030341; + 0x000009BE; 0x00010344; 0x000009D7; 0x00010345; + 0x00000B3E; 0x00010346; 0x00000B56; 0x00010347; + 0x00000B57; 0x00010348; 0x00000BBE; 0x00020349; + 0x00000BD7; 0x0002034B; 0x00000C56; 0x0001034D; + 0x00000CC2; 0x0001034E; 0x00000CD5; 0x0003034F; + 0x00000CD6; 0x00010352; 0x00000D3E; 0x00020353; + 0x00000D57; 0x00010355; 0x00000DCA; 0x00020356; + 0x00000DCF; 0x00010358; 0x00000DDF; 0x00010359; + 0x0000102E; 0x0001035A; 0x00003099; 0x0030035B; + 0x0000309A; 0x000A038B +|] + +let uniCharBMPPrecompDestinationTable = [| + 0x0041; 0x00C0; 0x0045; 0x00C8; 0x0049; 0x00CC; 0x004E; 0x01F8; + 0x004F; 0x00D2; 0x0055; 0x00D9; 0x0057; 0x1E80; 0x0059; 0x1EF2; + 0x0061; 0x00E0; 0x0065; 0x00E8; 0x0069; 0x00EC; 0x006E; 0x01F9; + 0x006F; 0x00F2; 0x0075; 0x00F9; 0x0077; 0x1E81; 0x0079; 0x1EF3; + 0x00A8; 0x1FED; 0x00C2; 0x1EA6; 0x00CA; 0x1EC0; 0x00D4; 0x1ED2; + 0x00DC; 0x01DB; 0x00E2; 0x1EA7; 0x00EA; 0x1EC1; 0x00F4; 0x1ED3; + 0x00FC; 0x01DC; 0x0102; 0x1EB0; 0x0103; 0x1EB1; 0x0112; 0x1E14; + 0x0113; 0x1E15; 0x014C; 0x1E50; 0x014D; 0x1E51; 0x01A0; 0x1EDC; + 0x01A1; 0x1EDD; 0x01AF; 0x1EEA; 0x01B0; 0x1EEB; 0x0391; 0x1FBA; + 0x0395; 0x1FC8; 0x0397; 0x1FCA; 0x0399; 0x1FDA; 0x039F; 0x1FF8; + 0x03A5; 0x1FEA; 0x03A9; 0x1FFA; 0x03B1; 0x1F70; 0x03B5; 0x1F72; + 0x03B7; 0x1F74; 0x03B9; 0x1F76; 0x03BF; 0x1F78; 0x03C5; 0x1F7A; + 0x03C9; 0x1F7C; 0x03CA; 0x1FD2; 0x03CB; 0x1FE2; 0x0415; 0x0400; + 0x0418; 0x040D; 0x0435; 0x0450; 0x0438; 0x045D; 0x1F00; 0x1F02; + 0x1F01; 0x1F03; 0x1F08; 0x1F0A; 0x1F09; 0x1F0B; 0x1F10; 0x1F12; + 0x1F11; 0x1F13; 0x1F18; 0x1F1A; 0x1F19; 0x1F1B; 0x1F20; 0x1F22; + 0x1F21; 0x1F23; 0x1F28; 0x1F2A; 0x1F29; 0x1F2B; 0x1F30; 0x1F32; + 0x1F31; 0x1F33; 0x1F38; 0x1F3A; 0x1F39; 0x1F3B; 0x1F40; 0x1F42; + 0x1F41; 0x1F43; 0x1F48; 0x1F4A; 0x1F49; 0x1F4B; 0x1F50; 0x1F52; + 0x1F51; 0x1F53; 0x1F59; 0x1F5B; 0x1F60; 0x1F62; 0x1F61; 0x1F63; + 0x1F68; 0x1F6A; 0x1F69; 0x1F6B; 0x1FBF; 0x1FCD; 0x1FFE; 0x1FDD; + 0x0041; 0x00C1; 0x0043; 0x0106; 0x0045; 0x00C9; 0x0047; 0x01F4; + 0x0049; 0x00CD; 0x004B; 0x1E30; 0x004C; 0x0139; 0x004D; 0x1E3E; + 0x004E; 0x0143; 0x004F; 0x00D3; 0x0050; 0x1E54; 0x0052; 0x0154; + 0x0053; 0x015A; 0x0055; 0x00DA; 0x0057; 0x1E82; 0x0059; 0x00DD; + 0x005A; 0x0179; 0x0061; 0x00E1; 0x0063; 0x0107; 0x0065; 0x00E9; + 0x0067; 0x01F5; 0x0069; 0x00ED; 0x006B; 0x1E31; 0x006C; 0x013A; + 0x006D; 0x1E3F; 0x006E; 0x0144; 0x006F; 0x00F3; 0x0070; 0x1E55; + 0x0072; 0x0155; 0x0073; 0x015B; 0x0075; 0x00FA; 0x0077; 0x1E83; + 0x0079; 0x00FD; 0x007A; 0x017A; 0x00A8; 0x0385; 0x00C2; 0x1EA4; + 0x00C5; 0x01FA; 0x00C6; 0x01FC; 0x00C7; 0x1E08; 0x00CA; 0x1EBE; + 0x00CF; 0x1E2E; 0x00D4; 0x1ED0; 0x00D5; 0x1E4C; 0x00D8; 0x01FE; + 0x00DC; 0x01D7; 0x00E2; 0x1EA5; 0x00E5; 0x01FB; 0x00E6; 0x01FD; + 0x00E7; 0x1E09; 0x00EA; 0x1EBF; 0x00EF; 0x1E2F; 0x00F4; 0x1ED1; + 0x00F5; 0x1E4D; 0x00F8; 0x01FF; 0x00FC; 0x01D8; 0x0102; 0x1EAE; + 0x0103; 0x1EAF; 0x0112; 0x1E16; 0x0113; 0x1E17; 0x014C; 0x1E52; + 0x014D; 0x1E53; 0x0168; 0x1E78; 0x0169; 0x1E79; 0x01A0; 0x1EDA; + 0x01A1; 0x1EDB; 0x01AF; 0x1EE8; 0x01B0; 0x1EE9; 0x0391; 0x0386; + 0x0395; 0x0388; 0x0397; 0x0389; 0x0399; 0x038A; 0x039F; 0x038C; + 0x03A5; 0x038E; 0x03A9; 0x038F; 0x03B1; 0x03AC; 0x03B5; 0x03AD; + 0x03B7; 0x03AE; 0x03B9; 0x03AF; 0x03BF; 0x03CC; 0x03C5; 0x03CD; + 0x03C9; 0x03CE; 0x03CA; 0x0390; 0x03CB; 0x03B0; 0x03D2; 0x03D3; + 0x0413; 0x0403; 0x041A; 0x040C; 0x0433; 0x0453; 0x043A; 0x045C; + 0x1F00; 0x1F04; 0x1F01; 0x1F05; 0x1F08; 0x1F0C; 0x1F09; 0x1F0D; + 0x1F10; 0x1F14; 0x1F11; 0x1F15; 0x1F18; 0x1F1C; 0x1F19; 0x1F1D; + 0x1F20; 0x1F24; 0x1F21; 0x1F25; 0x1F28; 0x1F2C; 0x1F29; 0x1F2D; + 0x1F30; 0x1F34; 0x1F31; 0x1F35; 0x1F38; 0x1F3C; 0x1F39; 0x1F3D; + 0x1F40; 0x1F44; 0x1F41; 0x1F45; 0x1F48; 0x1F4C; 0x1F49; 0x1F4D; + 0x1F50; 0x1F54; 0x1F51; 0x1F55; 0x1F59; 0x1F5D; 0x1F60; 0x1F64; + 0x1F61; 0x1F65; 0x1F68; 0x1F6C; 0x1F69; 0x1F6D; 0x1FBF; 0x1FCE; + 0x1FFE; 0x1FDE; 0x0041; 0x00C2; 0x0043; 0x0108; 0x0045; 0x00CA; + 0x0047; 0x011C; 0x0048; 0x0124; 0x0049; 0x00CE; 0x004A; 0x0134; + 0x004F; 0x00D4; 0x0053; 0x015C; 0x0055; 0x00DB; 0x0057; 0x0174; + 0x0059; 0x0176; 0x005A; 0x1E90; 0x0061; 0x00E2; 0x0063; 0x0109; + 0x0065; 0x00EA; 0x0067; 0x011D; 0x0068; 0x0125; 0x0069; 0x00EE; + 0x006A; 0x0135; 0x006F; 0x00F4; 0x0073; 0x015D; 0x0075; 0x00FB; + 0x0077; 0x0175; 0x0079; 0x0177; 0x007A; 0x1E91; 0x1EA0; 0x1EAC; + 0x1EA1; 0x1EAD; 0x1EB8; 0x1EC6; 0x1EB9; 0x1EC7; 0x1ECC; 0x1ED8; + 0x1ECD; 0x1ED9; 0x0041; 0x00C3; 0x0045; 0x1EBC; 0x0049; 0x0128; + 0x004E; 0x00D1; 0x004F; 0x00D5; 0x0055; 0x0168; 0x0056; 0x1E7C; + 0x0059; 0x1EF8; 0x0061; 0x00E3; 0x0065; 0x1EBD; 0x0069; 0x0129; + 0x006E; 0x00F1; 0x006F; 0x00F5; 0x0075; 0x0169; 0x0076; 0x1E7D; + 0x0079; 0x1EF9; 0x00C2; 0x1EAA; 0x00CA; 0x1EC4; 0x00D4; 0x1ED6; + 0x00E2; 0x1EAB; 0x00EA; 0x1EC5; 0x00F4; 0x1ED7; 0x0102; 0x1EB4; + 0x0103; 0x1EB5; 0x01A0; 0x1EE0; 0x01A1; 0x1EE1; 0x01AF; 0x1EEE; + 0x01B0; 0x1EEF; 0x0041; 0x0100; 0x0045; 0x0112; 0x0047; 0x1E20; + 0x0049; 0x012A; 0x004F; 0x014C; 0x0055; 0x016A; 0x0059; 0x0232; + 0x0061; 0x0101; 0x0065; 0x0113; 0x0067; 0x1E21; 0x0069; 0x012B; + 0x006F; 0x014D; 0x0075; 0x016B; 0x0079; 0x0233; 0x00C4; 0x01DE; + 0x00C6; 0x01E2; 0x00D5; 0x022C; 0x00D6; 0x022A; 0x00DC; 0x01D5; + 0x00E4; 0x01DF; 0x00E6; 0x01E3; 0x00F5; 0x022D; 0x00F6; 0x022B; + 0x00FC; 0x01D6; 0x01EA; 0x01EC; 0x01EB; 0x01ED; 0x0226; 0x01E0; + 0x0227; 0x01E1; 0x022E; 0x0230; 0x022F; 0x0231; 0x0391; 0x1FB9; + 0x0399; 0x1FD9; 0x03A5; 0x1FE9; 0x03B1; 0x1FB1; 0x03B9; 0x1FD1; + 0x03C5; 0x1FE1; 0x0418; 0x04E2; 0x0423; 0x04EE; 0x0438; 0x04E3; + 0x0443; 0x04EF; 0x1E36; 0x1E38; 0x1E37; 0x1E39; 0x1E5A; 0x1E5C; + 0x1E5B; 0x1E5D; 0x0041; 0x0102; 0x0045; 0x0114; 0x0047; 0x011E; + 0x0049; 0x012C; 0x004F; 0x014E; 0x0055; 0x016C; 0x0061; 0x0103; + 0x0065; 0x0115; 0x0067; 0x011F; 0x0069; 0x012D; 0x006F; 0x014F; + 0x0075; 0x016D; 0x0228; 0x1E1C; 0x0229; 0x1E1D; 0x0391; 0x1FB8; + 0x0399; 0x1FD8; 0x03A5; 0x1FE8; 0x03B1; 0x1FB0; 0x03B9; 0x1FD0; + 0x03C5; 0x1FE0; 0x0410; 0x04D0; 0x0415; 0x04D6; 0x0416; 0x04C1; + 0x0418; 0x0419; 0x0423; 0x040E; 0x0430; 0x04D1; 0x0435; 0x04D7; + 0x0436; 0x04C2; 0x0438; 0x0439; 0x0443; 0x045E; 0x1EA0; 0x1EB6; + 0x1EA1; 0x1EB7; 0x0041; 0x0226; 0x0042; 0x1E02; 0x0043; 0x010A; + 0x0044; 0x1E0A; 0x0045; 0x0116; 0x0046; 0x1E1E; 0x0047; 0x0120; + 0x0048; 0x1E22; 0x0049; 0x0130; 0x004D; 0x1E40; 0x004E; 0x1E44; + 0x004F; 0x022E; 0x0050; 0x1E56; 0x0052; 0x1E58; 0x0053; 0x1E60; + 0x0054; 0x1E6A; 0x0057; 0x1E86; 0x0058; 0x1E8A; 0x0059; 0x1E8E; + 0x005A; 0x017B; 0x0061; 0x0227; 0x0062; 0x1E03; 0x0063; 0x010B; + 0x0064; 0x1E0B; 0x0065; 0x0117; 0x0066; 0x1E1F; 0x0067; 0x0121; + 0x0068; 0x1E23; 0x006D; 0x1E41; 0x006E; 0x1E45; 0x006F; 0x022F; + 0x0070; 0x1E57; 0x0072; 0x1E59; 0x0073; 0x1E61; 0x0074; 0x1E6B; + 0x0077; 0x1E87; 0x0078; 0x1E8B; 0x0079; 0x1E8F; 0x007A; 0x017C; + 0x015A; 0x1E64; 0x015B; 0x1E65; 0x0160; 0x1E66; 0x0161; 0x1E67; + 0x017F; 0x1E9B; 0x1E62; 0x1E68; 0x1E63; 0x1E69; 0x0041; 0x00C4; + 0x0045; 0x00CB; 0x0048; 0x1E26; 0x0049; 0x00CF; 0x004F; 0x00D6; + 0x0055; 0x00DC; 0x0057; 0x1E84; 0x0058; 0x1E8C; 0x0059; 0x0178; + 0x0061; 0x00E4; 0x0065; 0x00EB; 0x0068; 0x1E27; 0x0069; 0x00EF; + 0x006F; 0x00F6; 0x0074; 0x1E97; 0x0075; 0x00FC; 0x0077; 0x1E85; + 0x0078; 0x1E8D; 0x0079; 0x00FF; 0x00D5; 0x1E4E; 0x00F5; 0x1E4F; + 0x016A; 0x1E7A; 0x016B; 0x1E7B; 0x0399; 0x03AA; 0x03A5; 0x03AB; + 0x03B9; 0x03CA; 0x03C5; 0x03CB; 0x03D2; 0x03D4; 0x0406; 0x0407; + 0x0410; 0x04D2; 0x0415; 0x0401; 0x0416; 0x04DC; 0x0417; 0x04DE; + 0x0418; 0x04E4; 0x041E; 0x04E6; 0x0423; 0x04F0; 0x0427; 0x04F4; + 0x042B; 0x04F8; 0x042D; 0x04EC; 0x0430; 0x04D3; 0x0435; 0x0451; + 0x0436; 0x04DD; 0x0437; 0x04DF; 0x0438; 0x04E5; 0x043E; 0x04E7; + 0x0443; 0x04F1; 0x0447; 0x04F5; 0x044B; 0x04F9; 0x044D; 0x04ED; + 0x0456; 0x0457; 0x04D8; 0x04DA; 0x04D9; 0x04DB; 0x04E8; 0x04EA; + 0x04E9; 0x04EB; 0x0041; 0x1EA2; 0x0045; 0x1EBA; 0x0049; 0x1EC8; + 0x004F; 0x1ECE; 0x0055; 0x1EE6; 0x0059; 0x1EF6; 0x0061; 0x1EA3; + 0x0065; 0x1EBB; 0x0069; 0x1EC9; 0x006F; 0x1ECF; 0x0075; 0x1EE7; + 0x0079; 0x1EF7; 0x00C2; 0x1EA8; 0x00CA; 0x1EC2; 0x00D4; 0x1ED4; + 0x00E2; 0x1EA9; 0x00EA; 0x1EC3; 0x00F4; 0x1ED5; 0x0102; 0x1EB2; + 0x0103; 0x1EB3; 0x01A0; 0x1EDE; 0x01A1; 0x1EDF; 0x01AF; 0x1EEC; + 0x01B0; 0x1EED; 0x0041; 0x00C5; 0x0055; 0x016E; 0x0061; 0x00E5; + 0x0075; 0x016F; 0x0077; 0x1E98; 0x0079; 0x1E99; 0x004F; 0x0150; + 0x0055; 0x0170; 0x006F; 0x0151; 0x0075; 0x0171; 0x0423; 0x04F2; + 0x0443; 0x04F3; 0x0041; 0x01CD; 0x0043; 0x010C; 0x0044; 0x010E; + 0x0045; 0x011A; 0x0047; 0x01E6; 0x0048; 0x021E; 0x0049; 0x01CF; + 0x004B; 0x01E8; 0x004C; 0x013D; 0x004E; 0x0147; 0x004F; 0x01D1; + 0x0052; 0x0158; 0x0053; 0x0160; 0x0054; 0x0164; 0x0055; 0x01D3; + 0x005A; 0x017D; 0x0061; 0x01CE; 0x0063; 0x010D; 0x0064; 0x010F; + 0x0065; 0x011B; 0x0067; 0x01E7; 0x0068; 0x021F; 0x0069; 0x01D0; + 0x006A; 0x01F0; 0x006B; 0x01E9; 0x006C; 0x013E; 0x006E; 0x0148; + 0x006F; 0x01D2; 0x0072; 0x0159; 0x0073; 0x0161; 0x0074; 0x0165; + 0x0075; 0x01D4; 0x007A; 0x017E; 0x00DC; 0x01D9; 0x00FC; 0x01DA; + 0x01B7; 0x01EE; 0x0292; 0x01EF; 0x0041; 0x0200; 0x0045; 0x0204; + 0x0049; 0x0208; 0x004F; 0x020C; 0x0052; 0x0210; 0x0055; 0x0214; + 0x0061; 0x0201; 0x0065; 0x0205; 0x0069; 0x0209; 0x006F; 0x020D; + 0x0072; 0x0211; 0x0075; 0x0215; 0x0474; 0x0476; 0x0475; 0x0477; + 0x0041; 0x0202; 0x0045; 0x0206; 0x0049; 0x020A; 0x004F; 0x020E; + 0x0052; 0x0212; 0x0055; 0x0216; 0x0061; 0x0203; 0x0065; 0x0207; + 0x0069; 0x020B; 0x006F; 0x020F; 0x0072; 0x0213; 0x0075; 0x0217; + 0x0391; 0x1F08; 0x0395; 0x1F18; 0x0397; 0x1F28; 0x0399; 0x1F38; + 0x039F; 0x1F48; 0x03A9; 0x1F68; 0x03B1; 0x1F00; 0x03B5; 0x1F10; + 0x03B7; 0x1F20; 0x03B9; 0x1F30; 0x03BF; 0x1F40; 0x03C1; 0x1FE4; + 0x03C5; 0x1F50; 0x03C9; 0x1F60; 0x0391; 0x1F09; 0x0395; 0x1F19; + 0x0397; 0x1F29; 0x0399; 0x1F39; 0x039F; 0x1F49; 0x03A1; 0x1FEC; + 0x03A5; 0x1F59; 0x03A9; 0x1F69; 0x03B1; 0x1F01; 0x03B5; 0x1F11; + 0x03B7; 0x1F21; 0x03B9; 0x1F31; 0x03BF; 0x1F41; 0x03C1; 0x1FE5; + 0x03C5; 0x1F51; 0x03C9; 0x1F61; 0x004F; 0x01A0; 0x0055; 0x01AF; + 0x006F; 0x01A1; 0x0075; 0x01B0; 0x0041; 0x1EA0; 0x0042; 0x1E04; + 0x0044; 0x1E0C; 0x0045; 0x1EB8; 0x0048; 0x1E24; 0x0049; 0x1ECA; + 0x004B; 0x1E32; 0x004C; 0x1E36; 0x004D; 0x1E42; 0x004E; 0x1E46; + 0x004F; 0x1ECC; 0x0052; 0x1E5A; 0x0053; 0x1E62; 0x0054; 0x1E6C; + 0x0055; 0x1EE4; 0x0056; 0x1E7E; 0x0057; 0x1E88; 0x0059; 0x1EF4; + 0x005A; 0x1E92; 0x0061; 0x1EA1; 0x0062; 0x1E05; 0x0064; 0x1E0D; + 0x0065; 0x1EB9; 0x0068; 0x1E25; 0x0069; 0x1ECB; 0x006B; 0x1E33; + 0x006C; 0x1E37; 0x006D; 0x1E43; 0x006E; 0x1E47; 0x006F; 0x1ECD; + 0x0072; 0x1E5B; 0x0073; 0x1E63; 0x0074; 0x1E6D; 0x0075; 0x1EE5; + 0x0076; 0x1E7F; 0x0077; 0x1E89; 0x0079; 0x1EF5; 0x007A; 0x1E93; + 0x01A0; 0x1EE2; 0x01A1; 0x1EE3; 0x01AF; 0x1EF0; 0x01B0; 0x1EF1; + 0x0055; 0x1E72; 0x0075; 0x1E73; 0x0041; 0x1E00; 0x0061; 0x1E01; + 0x0053; 0x0218; 0x0054; 0x021A; 0x0073; 0x0219; 0x0074; 0x021B; + 0x0043; 0x00C7; 0x0044; 0x1E10; 0x0045; 0x0228; 0x0047; 0x0122; + 0x0048; 0x1E28; 0x004B; 0x0136; 0x004C; 0x013B; 0x004E; 0x0145; + 0x0052; 0x0156; 0x0053; 0x015E; 0x0054; 0x0162; 0x0063; 0x00E7; + 0x0064; 0x1E11; 0x0065; 0x0229; 0x0067; 0x0123; 0x0068; 0x1E29; + 0x006B; 0x0137; 0x006C; 0x013C; 0x006E; 0x0146; 0x0072; 0x0157; + 0x0073; 0x015F; 0x0074; 0x0163; 0x0041; 0x0104; 0x0045; 0x0118; + 0x0049; 0x012E; 0x004F; 0x01EA; 0x0055; 0x0172; 0x0061; 0x0105; + 0x0065; 0x0119; 0x0069; 0x012F; 0x006F; 0x01EB; 0x0075; 0x0173; + 0x0044; 0x1E12; 0x0045; 0x1E18; 0x004C; 0x1E3C; 0x004E; 0x1E4A; + 0x0054; 0x1E70; 0x0055; 0x1E76; 0x0064; 0x1E13; 0x0065; 0x1E19; + 0x006C; 0x1E3D; 0x006E; 0x1E4B; 0x0074; 0x1E71; 0x0075; 0x1E77; + 0x0048; 0x1E2A; 0x0068; 0x1E2B; 0x0045; 0x1E1A; 0x0049; 0x1E2C; + 0x0055; 0x1E74; 0x0065; 0x1E1B; 0x0069; 0x1E2D; 0x0075; 0x1E75; + 0x0042; 0x1E06; 0x0044; 0x1E0E; 0x004B; 0x1E34; 0x004C; 0x1E3A; + 0x004E; 0x1E48; 0x0052; 0x1E5E; 0x0054; 0x1E6E; 0x005A; 0x1E94; + 0x0062; 0x1E07; 0x0064; 0x1E0F; 0x0068; 0x1E96; 0x006B; 0x1E35; + 0x006C; 0x1E3B; 0x006E; 0x1E49; 0x0072; 0x1E5F; 0x0074; 0x1E6F; + 0x007A; 0x1E95; 0x003C; 0x226E; 0x003D; 0x2260; 0x003E; 0x226F; + 0x2190; 0x219A; 0x2192; 0x219B; 0x2194; 0x21AE; 0x21D0; 0x21CD; + 0x21D2; 0x21CF; 0x21D4; 0x21CE; 0x2203; 0x2204; 0x2208; 0x2209; + 0x220B; 0x220C; 0x2223; 0x2224; 0x2225; 0x2226; 0x223C; 0x2241; + 0x2243; 0x2244; 0x2245; 0x2247; 0x2248; 0x2249; 0x224D; 0x226D; + 0x2261; 0x2262; 0x2264; 0x2270; 0x2265; 0x2271; 0x2272; 0x2274; + 0x2273; 0x2275; 0x2276; 0x2278; 0x2277; 0x2279; 0x227A; 0x2280; + 0x227B; 0x2281; 0x227C; 0x22E0; 0x227D; 0x22E1; 0x2282; 0x2284; + 0x2283; 0x2285; 0x2286; 0x2288; 0x2287; 0x2289; 0x2291; 0x22E2; + 0x2292; 0x22E3; 0x22A2; 0x22AC; 0x22A8; 0x22AD; 0x22A9; 0x22AE; + 0x22AB; 0x22AF; 0x22B2; 0x22EA; 0x22B3; 0x22EB; 0x22B4; 0x22EC; + 0x22B5; 0x22ED; 0x00A8; 0x1FC1; 0x03B1; 0x1FB6; 0x03B7; 0x1FC6; + 0x03B9; 0x1FD6; 0x03C5; 0x1FE6; 0x03C9; 0x1FF6; 0x03CA; 0x1FD7; + 0x03CB; 0x1FE7; 0x1F00; 0x1F06; 0x1F01; 0x1F07; 0x1F08; 0x1F0E; + 0x1F09; 0x1F0F; 0x1F20; 0x1F26; 0x1F21; 0x1F27; 0x1F28; 0x1F2E; + 0x1F29; 0x1F2F; 0x1F30; 0x1F36; 0x1F31; 0x1F37; 0x1F38; 0x1F3E; + 0x1F39; 0x1F3F; 0x1F50; 0x1F56; 0x1F51; 0x1F57; 0x1F59; 0x1F5F; + 0x1F60; 0x1F66; 0x1F61; 0x1F67; 0x1F68; 0x1F6E; 0x1F69; 0x1F6F; + 0x1FBF; 0x1FCF; 0x1FFE; 0x1FDF; 0x0391; 0x1FBC; 0x0397; 0x1FCC; + 0x03A9; 0x1FFC; 0x03AC; 0x1FB4; 0x03AE; 0x1FC4; 0x03B1; 0x1FB3; + 0x03B7; 0x1FC3; 0x03C9; 0x1FF3; 0x03CE; 0x1FF4; 0x1F00; 0x1F80; + 0x1F01; 0x1F81; 0x1F02; 0x1F82; 0x1F03; 0x1F83; 0x1F04; 0x1F84; + 0x1F05; 0x1F85; 0x1F06; 0x1F86; 0x1F07; 0x1F87; 0x1F08; 0x1F88; + 0x1F09; 0x1F89; 0x1F0A; 0x1F8A; 0x1F0B; 0x1F8B; 0x1F0C; 0x1F8C; + 0x1F0D; 0x1F8D; 0x1F0E; 0x1F8E; 0x1F0F; 0x1F8F; 0x1F20; 0x1F90; + 0x1F21; 0x1F91; 0x1F22; 0x1F92; 0x1F23; 0x1F93; 0x1F24; 0x1F94; + 0x1F25; 0x1F95; 0x1F26; 0x1F96; 0x1F27; 0x1F97; 0x1F28; 0x1F98; + 0x1F29; 0x1F99; 0x1F2A; 0x1F9A; 0x1F2B; 0x1F9B; 0x1F2C; 0x1F9C; + 0x1F2D; 0x1F9D; 0x1F2E; 0x1F9E; 0x1F2F; 0x1F9F; 0x1F60; 0x1FA0; + 0x1F61; 0x1FA1; 0x1F62; 0x1FA2; 0x1F63; 0x1FA3; 0x1F64; 0x1FA4; + 0x1F65; 0x1FA5; 0x1F66; 0x1FA6; 0x1F67; 0x1FA7; 0x1F68; 0x1FA8; + 0x1F69; 0x1FA9; 0x1F6A; 0x1FAA; 0x1F6B; 0x1FAB; 0x1F6C; 0x1FAC; + 0x1F6D; 0x1FAD; 0x1F6E; 0x1FAE; 0x1F6F; 0x1FAF; 0x1F70; 0x1FB2; + 0x1F74; 0x1FC2; 0x1F7C; 0x1FF2; 0x1FB6; 0x1FB7; 0x1FC6; 0x1FC7; + 0x1FF6; 0x1FF7; 0x0627; 0x0622; 0x0627; 0x0623; 0x0648; 0x0624; + 0x064A; 0x0626; 0x06C1; 0x06C2; 0x06D2; 0x06D3; 0x06D5; 0x06C0; + 0x0627; 0x0625; 0x0928; 0x0929; 0x0930; 0x0931; 0x0933; 0x0934; + 0x09C7; 0x09CB; 0x09C7; 0x09CC; 0x0B47; 0x0B4B; 0x0B47; 0x0B48; + 0x0B47; 0x0B4C; 0x0BC6; 0x0BCA; 0x0BC7; 0x0BCB; 0x0B92; 0x0B94; + 0x0BC6; 0x0BCC; 0x0C46; 0x0C48; 0x0CC6; 0x0CCA; 0x0CBF; 0x0CC0; + 0x0CC6; 0x0CC7; 0x0CCA; 0x0CCB; 0x0CC6; 0x0CC8; 0x0D46; 0x0D4A; + 0x0D47; 0x0D4B; 0x0D46; 0x0D4C; 0x0DD9; 0x0DDA; 0x0DDC; 0x0DDD; + 0x0DD9; 0x0DDC; 0x0DD9; 0x0DDE; 0x1025; 0x1026; 0x3046; 0x3094; + 0x304B; 0x304C; 0x304D; 0x304E; 0x304F; 0x3050; 0x3051; 0x3052; + 0x3053; 0x3054; 0x3055; 0x3056; 0x3057; 0x3058; 0x3059; 0x305A; + 0x305B; 0x305C; 0x305D; 0x305E; 0x305F; 0x3060; 0x3061; 0x3062; + 0x3064; 0x3065; 0x3066; 0x3067; 0x3068; 0x3069; 0x306F; 0x3070; + 0x3072; 0x3073; 0x3075; 0x3076; 0x3078; 0x3079; 0x307B; 0x307C; + 0x309D; 0x309E; 0x30A6; 0x30F4; 0x30AB; 0x30AC; 0x30AD; 0x30AE; + 0x30AF; 0x30B0; 0x30B1; 0x30B2; 0x30B3; 0x30B4; 0x30B5; 0x30B6; + 0x30B7; 0x30B8; 0x30B9; 0x30BA; 0x30BB; 0x30BC; 0x30BD; 0x30BE; + 0x30BF; 0x30C0; 0x30C1; 0x30C2; 0x30C4; 0x30C5; 0x30C6; 0x30C7; + 0x30C8; 0x30C9; 0x30CF; 0x30D0; 0x30D2; 0x30D3; 0x30D5; 0x30D6; + 0x30D8; 0x30D9; 0x30DB; 0x30DC; 0x30EF; 0x30F7; 0x30F0; 0x30F8; + 0x30F1; 0x30F9; 0x30F2; 0x30FA; 0x30FD; 0x30FE; 0x306F; 0x3071; + 0x3072; 0x3074; 0x3075; 0x3077; 0x3078; 0x307A; 0x307B; 0x307D; + 0x30CF; 0x30D1; 0x30D2; 0x30D4; 0x30D5; 0x30D7; 0x30D8; 0x30DA; + 0x30DB; 0x30DD +|] + +let uniCharCombiningBitmap = "\ +\x00\x00\x00\x01\x02\x03\x04\x05\ +\x00\x06\x07\x08\x09\x0A\x0B\x0C\ +\x0D\x14\x00\x00\x00\x00\x00\x0E\ +\x0F\x00\x00\x00\x00\x00\x00\x00\ +\x10\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x11\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x12\x00\x00\x13\x00\ +\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\ +\xFF\xFF\x00\x00\xFF\xFF\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x78\x03\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\xFE\xFF\xFB\xFF\xFF\xBB\ +\x16\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\xF8\x3F\x00\x00\x00\x01\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\xC0\xFF\x9F\x3D\x00\x00\ +\x00\x00\x02\x00\x00\x00\xFF\xFF\ +\xFF\x07\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\xC0\xFF\x01\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x0E\x00\x00\x00\x00\x00\x00\xD0\ +\xFF\x3F\x1E\x00\x0C\x00\x00\x00\ +\x0E\x00\x00\x00\x00\x00\x00\xD0\ +\x9F\x39\x80\x00\x0C\x00\x00\x00\ +\x04\x00\x00\x00\x00\x00\x00\xD0\ +\x87\x39\x00\x00\x00\x00\x03\x00\ +\x0E\x00\x00\x00\x00\x00\x00\xD0\ +\xBF\x3B\x00\x00\x00\x00\x00\x00\ +\x0E\x00\x00\x00\x00\x00\x00\xD0\ +\x8F\x39\xC0\x00\x00\x00\x00\x00\ +\x04\x00\x00\x00\x00\x00\x00\xC0\ +\xC7\x3D\x80\x00\x00\x00\x00\x00\ +\x0E\x00\x00\x00\x00\x00\x00\xC0\ +\xDF\x3D\x60\x00\x00\x00\x00\x00\ +\x0C\x00\x00\x00\x00\x00\x00\xC0\ +\xDF\x3D\x60\x00\x00\x00\x00\x00\ +\x0C\x00\x00\x00\x00\x00\x00\xC0\ +\xCF\x3D\x80\x00\x00\x00\x00\x00\ +\x0C\x00\x00\x00\x00\x00\x00\x00\ +\x00\x84\x5F\xFF\x00\x00\x0C\x00\ +\x00\x00\x00\x00\x00\x00\xF2\x07\ +\x80\x7F\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\xF2\x1B\ +\x00\x3F\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x03\x00\x00\xA0\xC2\ +\x00\x00\x00\x00\x00\x00\xFE\xFF\ +\xDF\x00\xFF\xFE\xFF\xFF\xFF\x1F\ +\x40\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\xF0\xC7\x03\ +\x00\x00\xC0\x03\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x1C\x00\x00\x00\x1C\x00\ +\x00\x00\x0C\x00\x00\x00\x0C\x00\ +\x00\x00\x00\x00\x00\x00\xF0\xFF\ +\xFF\xFF\x0F\x00\x00\x00\x00\x00\ +\x00\x38\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x02\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\xFF\xFF\xFF\x07\x00\x00\ +\x00\x00\x00\x00\x00\xFC\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x06\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x40\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\xFF\xFF\x00\x00\x0F\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\x00\x00\x00\x00\ +\x00\x00\x00\x00\xFE\xFF\x3F\x00\ +\x00\x00\x00\x00\x00\xFF\xFF\xFF\ +\x07\x00\x00\x00\x00\x00\x00\x00" + +(****) + +let bitmap_test base bitmap character = + character >= base && character < 0x10000 + && + (let value = get bitmap ((character lsr 8) land 0xFF) in + value = 0xFF + || + (value <> 0 + && + get bitmap ((value - 1) * 32 + 256 + (character land 0xFF) / 8) + land (1 lsl (character land 7)) <> 0)) + +let unicode_combinable character = + bitmap_test 0x0300 uniCharCombiningBitmap character + +let rec find_rec t i j v = + if i + 1 = j then begin + if t.(i * 2) = v then t.(i * 2 + 1) else 0 + end else begin + let k = (i + j) / 2 in + if v < t.(k * 2) then + find_rec t i k v + else + find_rec t k j v + end + +let find t i n v = + let j = i + n in + if v < t.(2 * i) || v > t.(2 * (j - 1)) then 0 else + find_rec t i j v + +let uniCharPrecompSourceTableLen = Array.length uniCharPrecompSourceTable / 2 + +let combine v v' = + if v' >= hangul_vbase && v' < hangul_tbase + hangul_tcount then begin + if + v' < hangul_vbase + hangul_vcount && + v >= hangul_lbase && v < hangul_lbase + hangul_lcount + then + hangul_sbase + ((v - hangul_lbase) * (hangul_vcount * hangul_tcount)) + + ((v' - hangul_vbase) * hangul_tcount) + else if + v' > hangul_tbase && + v >= hangul_sbase && v < hangul_sbase + hangul_scount + then + if (v - hangul_sbase) mod hangul_tcount <> 0 then 0 else + v + v' - hangul_tbase + else + 0 + end else begin + let k = + find uniCharPrecompSourceTable 0 + uniCharPrecompSourceTableLen v' + in + if k = 0 then 0 else + find uniCharBMPPrecompDestinationTable (k land 0xFFFF) (k lsr 16) v + end + +(****) + +let rec scan d s i l = + if i < l then begin + let c = get s i in + if c < 0x80 then + cont d s i l (i + 1) c + else if c < 0xE0 then begin + (* 80 - 7FF *) + if c < 0xc2 || i + 1 >= l then fail () else + let c1 = get s (i + 1) in + if c1 land 0xc0 <> 0x80 then fail () else + let v = c lsl 6 + c1 - 0x3080 in + cont d s i l (i + 2) v + end else if c < 0xF0 then begin + (* 800 - FFFF *) + if i + 2 >= l then fail () else + let c1 = get s (i + 1) in + let c2 = get s (i + 2) in + if (c1 lor c2) land 0xc0 <> 0x80 then fail () else + let v = c lsl 12 + c1 lsl 6 + c2 - 0xe2080 in + if v < 0x800 then fail () else + cont d s i l (i + 3) v + end else begin + (* 10000 - 10FFFF *) + if i + 3 >= l then fail () else + let c1 = get s (i + 1) in + let c2 = get s (i + 2) in + let c3 = get s (i + 3) in + if (c1 lor c2 lor c3) land 0xc0 <> 0x80 then fail () else + let v = c lsl 18 + c1 lsl 12 + c2 lsl 6 + c3 - 0x03c82080 in + if v < 0x10000 || v > 0x10ffff then fail () else + cont d s i l (i + 4) v + end + end else begin + let (i1, i2) = d in + String.blit s i2 s i1 (l - i2); + String.sub s 0 (i1 + l - i2) + end + +and cont d s i l j v' = + if unicode_combinable v' then begin + let i = prev_char s i in + let (v, _) = decode_char s i l in + let v'' = combine v v' in + if v'' = 0 then + scan d s j l + else begin + let (i1, i2) = d in + String.blit s i2 s i1 (i - i2); + let i1 = i1 + i - i2 in + let (v'', i) = compose_rec s j l v'' in + let i1 = encode_char s i1 l v'' in + scan (i1, i) s i l + end + end else + scan d s j l + +and compose_rec s i l v = + try + let (v', j) = decode_char s i l in + if unicode_combinable v' then begin + let v'' = combine v v' in + if v'' = 0 then + (v, i) + else + compose_rec s j l v'' + end else + (v, i) + with Invalid -> + (v, i) + +let compose s = + try scan (0, 0) (String.copy s) 0 (String.length s) with Invalid -> s + +(***) + +let set_2 s i v = + set s i (v land 0xff); + set s (i + 1) (v lsr 8) + +let get_2 s i = (get s (i + 1)) lsl 8 + get s i + +let rec scan s' j s i l = + if i < l then begin + let c = get s i in + if c < 0x80 then + cont s' j s (i + 1) l c + else if c < 0xE0 then begin + (* 80 - 7FF *) + if c < 0xc2 || i + 1 >= l then fail () else + let c1 = get s (i + 1) in + if c1 land 0xc0 <> 0x80 then fail () else + let v = c lsl 6 + c1 - 0x3080 in + cont s' j s (i + 2) l v + end else if c < 0xF0 then begin + (* 800 - FFFF *) + if i + 2 >= l then fail () else + let c1 = get s (i + 1) in + let c2 = get s (i + 2) in + if (c1 lor c2) land 0xc0 <> 0x80 then fail () else + let v = c lsl 12 + c1 lsl 6 + c2 - 0xe2080 in + if v < 0x800 then fail () else + cont s' j s (i + 3) l v + end else begin + (* 10000 - 10FFFF *) + if i + 3 >= l then fail () else + let c1 = get s (i + 1) in + let c2 = get s (i + 2) in + let c3 = get s (i + 3) in + if (c1 lor c2 lor c3) land 0xc0 <> 0x80 then fail () else + let v = c lsl 18 + c1 lsl 12 + c2 lsl 6 + c3 - 0x03c82080 in + if v < 0x10000 || v > 0x10ffff then fail () else + cont s' j s (i + 4) l v + end + end else + String.sub s' 0 j + +and cont s' j s i l v = + if v < 0x10000 then begin + set_2 s' j v; + scan s' (j + 2) s i l + end else begin + let v = v - 0x10000 in + set_2 s' j (v lsr 10 + 0xD800); + set_2 s' (j + 2) (v land 0x3FF + 0xDC00); + scan s' (j + 4) s i l + end + +let to_utf_16 s = + let l = String.length s in + let s' = String.create (2 * l) in + scan s' 0 s 0 l + +(****) + +let rec scan s' i' l' s i l = + if i + 2 <= l then begin + let v = get_2 s i in + if v < 0xD800 || v > 0xDFFF then + let i' = encode_char s' i' l' v in + scan s' i' l' s (i + 2) l + else if v >= 0xdc00 || i + 4 > l then + fail () + else begin + let v' = get_2 s (i + 2) in + if v' < 0xDC00 || v' > 0XDFFF then fail () else + let i' = + encode_char s' i' l' ((v - 0xD800) lsl 10 + (v' - 0xDC00) + 0x10000) + in + scan s' i' l' s (i + 4) l + end + end else if i < l then + fail () + else + String.sub s' 0 i' + + +let from_utf_16 s = + let l = String.length s in + let l' = 3 * l / 2 in + let s' = String.create l' in + scan s' 0 l' s 0 l + +(* +from_utf16 + +2 bytes -> 3 bytes +4 bytes -> 4 bytes +*)
\ No newline at end of file |