[prev in list] [next in list] [prev in thread] [next in thread]
List: mono-patches
Subject: [Mono-patches] r45251 -
From: "Atsushi Enomoto (ginga () kit ! hi-ho ! ne ! jp)" <atsushi () mono-cvs ! ximian ! com>
Date: 2005-05-31 11:06:13
Message-ID: 20050531110613.5E53394765 () mono-cvs ! ximian ! com
[Download RAW message or body]
Author: atsushi
Date: 2005-05-31 07:06:13 -0400 (Tue, 31 May 2005)
New Revision: 45251
Modified:
branches/atsushi/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog
branches/atsushi/mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt
branches/atsushi/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs
Log:
2005-05-31 Atsushi Enomoto <atsushi@ximian.com>
* Collation-notes.txt : wrong comment cleanup and spelling fixes.
* create-mscompat-collation-table.cs : added diacritic support for
Latin letters (as long as covered in primary weight).
Modified: branches/atsushi/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog
===================================================================
--- branches/atsushi/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog 2005-05-31 \
10:59:45 UTC (rev 45250)
+++ branches/atsushi/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog 2005-05-31 \
11:06:13 UTC (rev 45251) @@ -1,5 +1,11 @@
2005-05-31 Atsushi Enomoto <atsushi@ximian.com>
+ * Collation-notes.txt : wrong comment cleanup and spelling fixes.
+ * create-mscompat-collation-table.cs : added diacritic support for
+ Latin letters (as long as covered in primary weight).
+
+2005-05-31 Atsushi Enomoto <atsushi@ximian.com>
+
* Makefile : minor fixes. Added warning lines to generated sources.
2005-05-31 Atsushi Enomoto <atsushi@ximian.com>
Modified: branches/atsushi/mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt
===================================================================
--- branches/atsushi/mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt 2005-05-31 \
10:59:45 UTC (rev 45250)
+++ branches/atsushi/mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt 2005-05-31 \
11:06:13 UTC (rev 45251) @@ -213,15 +213,6 @@
**** level 2
- <del>
- For Japanese voice marks, it just sums the count up.
-
- There also seems special rule for Thai (E01-E4F) e.g. E47 works like
- Japanese voice marks.
-
- For other letters, there will be a table.
- </del>
-
It looks like all level 2 keys are simply accumulated, without
considering overflow. It sometimes makes sense (e.g. diaeresis and
acute) but it causes many conflicts (e.g. "A\u0308\u0301" and "\u1EA6"
@@ -251,11 +242,9 @@
- 4 Alef/Bet/Gimel/Dalet (2135-2138)
- 8 Final form in presentation form B in FE82-FEF2
- 18 Medial form in presentation form B in FE8C-FEF4
-
- Grep "ISOLATED", "FINAL" or "MEDIAL" against UnicodeData.txt
- (and filter by codepoints).
- or alternatively, see DerivedDecompositionType.txt.
-
+ Grep "ISOLATED", "FINAL" or "MEDIAL" on UnicodeData.txt
+ (and filter by codepoints).
+ or alternatively, see DerivedDecompositionType.txt.
- 22 6A9 (TODO: what is it?)
- 28 6AA (TODO: what is it?)
@@ -423,7 +412,7 @@
-10: dot above
-11: middle dot
-12: circumflex
- -13: diaresis
+ -13: diaeresis
-14: caron
Note that 1C4-1C6 are covered but they are also expanded.
-15: breve (are Cyrillic letters also covered? At least 4C1/4C2 are.)
@@ -439,19 +428,19 @@
(i.e. they are not a one-to-one mapping: not every
"stroke" is mapped to 1E, and not every 1E is mapped to
"stroke".)
- -1F: diaresis and acute | with circumflex and grave | l slash
+ -1F: diaeresis and acute | with circumflex and grave | l slash
beware "symbol slash"
- -20: diaresis and grave | 19B,19F
+ -20: diaeresis and grave | 19B,19F
-21: breve and acute | D8,F8
-22: caron and dot above | breve and grave
-23: macron and acute
-24: macron and grave
- -25: diaresis and caron | dot above and macron | tilde and acute
+ -25: diaeresis and caron | dot above and macron | tilde and acute
-26: ring above and acute
- -28: diaresis and macron | cedilla and acute |
- macron and diaresis
+ -28: diaeresis and macron | cedilla and acute |
+ macron and diaeresis
-29: circumflex and tilde
- -2A: tilde and diaresis
+ -2A: tilde and diaeresis
-2B: stroke and acute
-2C: breve and tilde
-2F: cedilla and breve
@@ -466,7 +455,7 @@
-55: line below | circumflex and hook above
-57: palatal hook (actually only 1AB)
-58: dot below except for cp{1EA0,1EA1}
- -59: "retroflex" (without "WITH") | diaresis below | 1EA0,1EA1
+ -59: "retroflex" (without "WITH") | diaeresis below | 1EA0,1EA1
-5A: ring below | 1E76,1E77
-60: circumflex below except for cp{1E76,1E77} | horn and acute
-61: breve below | horn and grave
Modified: branches/atsushi/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs
===================================================================
--- branches/atsushi/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs 2005-05-31 \
10:59:45 UTC (rev 45250)
+++ branches/atsushi/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs 2005-05-31 \
11:06:13 UTC (rev 45251) @@ -84,16 +84,77 @@
bool [] isSmallCapital = new bool [char.MaxValue + 1];
bool [] isUppercase = new bool [char.MaxValue + 1];
+
byte [] decompType = new byte [char.MaxValue + 1];
int [] decompIndex = new int [char.MaxValue + 1];
int [] decompLength = new int [char.MaxValue + 1];
int [] decompValues;
decimal [] decimalValue = new decimal [char.MaxValue + 1];
+ byte [] diacritical = new byte [char.MaxValue + 1];
+
+ string [] diacritics = new string [] {
+ " ACUTE;", " GRAVE;", " DOT ABOVE;", " MIDDLE DOT;",
+ " CIRCUMFLEX;", " DIAERESIS;", " CARON;", " BREVE;",
+ " DIALYTIKA AND TONOS;", " MACRON;", " TILDE;", " RING ABOVE;",
+ " OGONEK;", " CEDILLA;",
+ " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
+ " STROKE;", " CIRCUMFLEX AND ACUTE;",
+ " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
+ " DIAERESIS AND GRAVE;",
+ " BREVE AND ACUTE;",
+ " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
+ " MACRON AND ACUTE;",
+ " MACRON AND GRAVE;",
+ " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
+ " RING ABOVE AND ACUTE",
+ " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
+ " CIRCUMFLEX AND TILDE",
+ " TILDE AND DIAERESIS",
+ " STROKE AND ACUTE",
+ " BREVE AND TILDE",
+ " CEDILLA AND BREVE",
+ " OGONEK AND MACRON",
+ " HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
+ " DOUBLE GRAVE;",
+ " INVERTED BREVE",
+ " PRECEDED BY APOSTROPHE",
+ " HORN;",
+ " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
+ " PALATAL HOOK",
+ " DOT BELOW;",
+ " RETROFLEX;", "DIAERESIS BELOW",
+ " RING BELOW",
+ " CIRCUMFLEX BELOW", "HORN AND ACUTE",
+ " BREVE BELOW;", " HORN AND GRAVE",
+ " TILDE BELOW",
+ " DOT BELOW AND DOT ABOVE",
+ " RIGHT HALF RING", " HORN AND TILDE",
+ " CIRCUMFLEX AND DOT BELOW",
+ " BREVE AND DOT BELOW",
+ " DOT BELOW AND MACRON",
+ " HORN AND HOOK ABOVE",
+ " HORN AND DOT",
+ };
+ byte [] diacriticWeights = new byte [] {
+ 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
+ 0x17, 0x19, 0x1A, 0x1B, 0x1C,
+ 0x1D, 0x1D, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
+ 0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
+ 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
+ 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
+ 0x43, 0x43, 0x43, 0x44, 0x46, 0x48,
+ 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
+ 0x60, 0x60, 0x61, 0x61, 0x63, 0x68,
+ 0x69, 0x69, 0x6A, 0x6D, 0x6E,
+ 0x95, 0xAA
+ };
+
char [] orderedCyrillic;
char [] orderedGurmukhi;
char [] orderedGujarati;
char [] orderedGeorgian;
+
static readonly char [] orderedTamilConsonants = new char [] {
// based on traditional Tamil consonants, except for
// Grantha (where Microsoft breaks traditionalism).
@@ -154,6 +215,8 @@
Result.WriteLine ("static int [] level2 = new int [] {");
for (int i = 0; i < map.Length; i++) {
int value = map [i].Level2;
+ if (map [i].Category == 0xE)
+ value |= diacritical [i];
if (value == 0)
Result.Write ("0,");
else
@@ -250,6 +313,10 @@
if (s.IndexOf ("SMALL CAPITAL") > 0)
isSmallCapital [cp] = true;
+ for (int d = 0; d < diacritics.Length; d++)
+ if (s.IndexOf (diacritics [d]) > 0)
+ diacritical [cp] |= diacriticWeights [d];
+
// normalizationType
string decomp = values [4];
idx = decomp.IndexOf ('<');
@@ -951,10 +1018,10 @@
internal struct CharMapEntry
{
- public readonly byte Category;
- public readonly byte Level1;
- public readonly byte Level2; // It is always single byte.
- public readonly bool Defined;
+ public byte Category;
+ public byte Level1;
+ public byte Level2; // It is always single byte.
+ public bool Defined;
public CharMapEntry (byte category, byte level1, byte level2)
{
_______________________________________________
Mono-patches maillist - Mono-patches@lists.ximian.com
http://lists.ximian.com/mailman/listinfo/mono-patches
[prev in list] [next in list] [prev in thread] [next in thread]
Configure |
About |
News |
Add a list |
Sponsored by KoreLogic