/* Capitalization rules for HPFS */

/* In OS/2, HPFS filenames preserve upper and lower case letter distinctions
   but filename matching ignores case.  That is, creating a file "Foo"
   actually creates a file named "Foo" which can be looked up as "Foo",
   "foo", or "FOO", among other possibilities.

   Also, HPFS is internationalized -- a table giving the uppercase
   equivalent of every character is stored in the filesystem, so that
   any national character set may be used.  If several different
   national character sets are in use, several tables are stored
   in the filesystem.

   It would be perfectly reasonable for Linux HPFS to act as a Unix
   filesystem and match "Foo" only if asked for "Foo" exactly.  But
   the sort order of HPFS directories is case-insensitive, so Linux
   still has to know the capitalization rules used by OS/2.  Because
   of this, it turns out to be more natural for us to be case-insensitive
   than not.

   Currently the standard character set used by Linux is Latin-1.
   Work is underway to permit people to use UTF-8 instead; therefore
   all code that depends on the character set is segregated here.

   (It would be wonderful if Linux HPFS could be independent of what
   character set is in use on the Linux side, but because of the
   necessary case folding this is impossible.)

   There is a map from Latin-1 into code page 850 for every printing
   character in Latin-1.  Most, maybe all, OS/2 installations have code
   page 850 available, and surely all (on PC hardware) have 437 available.

   It is not clear exactly how HPFS.IFS handles the situation when
   multiple code pages are in use.  Experiments show that

   - tables on the disk give uppercasing rules for the installed code pages

   - each directory entry is tagged with what code page was current
     when that name was created

   - doing just CHCP, without changing what's on the disk in any way,
     can change what DIR reports, and what name a case-folded match
     will match.

   This means, I think, that HPFS.IFS operates in the current code
   page, without regard to the uppercasing information recorded in
   the tables on the disk.  It does record the uppercasing rules
   it used, perhaps for alien operating systems such as us, but it
   does not appear to use them itself.

   So: Linux, a Latin-1 system, will operate in code page 850.  We
   recode between 850 and Latin-1 when dealing with the names actually
   on the disk.  We don't use the uppercasing tables either.

   In a hypothetical UTF-8 implementation, one reasonable way to
   proceed that matches OS/2 (for least surprise) is: do case
   translation in UTF-8, and recode to/from one of the code pages
   available on the mounted filesystem.  Reject as invalid any name
   containing chars that can't be represented on disk by one of the
   code pages OS/2 is using.  Recoding from on-disk names to UTF-8
   could use the code page tags, though this is not what OS/2 does. */

/* Recoding table from code page 850 to Latin-1 for the upper half of
   the byte range.  Entry I holds the Latin-1 code for code page 850
   character 128 + I; the row labels give the code page 850 value of
   the first entry in each row. */
static const unsigned char tb_cp850_to_latin1[128] =
{
  /* 0x80 */ 199, 252, 233, 226, 228, 224, 229, 231,
  /* 0x88 */ 234, 235, 232, 239, 238, 236, 196, 197,
  /* 0x90 */ 201, 230, 198, 244, 246, 242, 251, 249,
  /* 0x98 */ 255, 214, 220, 248, 163, 216, 215, 159,
  /* 0xA0 */ 225, 237, 243, 250, 241, 209, 170, 186,
  /* 0xA8 */ 191, 174, 172, 189, 188, 161, 171, 187,
  /* 0xB0 */ 155, 156, 157, 144, 151, 193, 194, 192,
  /* 0xB8 */ 169, 135, 128, 131, 133, 162, 165, 147,
  /* 0xC0 */ 148, 153, 152, 150, 145, 154, 227, 195,
  /* 0xC8 */ 132, 130, 137, 136, 134, 129, 138, 164,
  /* 0xD0 */ 240, 208, 202, 203, 200, 158, 205, 206,
  /* 0xD8 */ 207, 149, 146, 141, 140, 166, 204, 139,
  /* 0xE0 */ 211, 223, 212, 210, 245, 213, 181, 254,
  /* 0xE8 */ 222, 218, 219, 217, 253, 221, 175, 180,
  /* 0xF0 */ 173, 177, 143, 190, 182, 167, 247, 184,
  /* 0xF8 */ 176, 168, 183, 185, 179, 178, 142, 160,
};
82
#if 0
/* Recoding table from Latin-1 to code page 850 for the upper half of
   the byte range.  Entry I holds the code page 850 code for Latin-1
   character 128 + I; the row labels give the Latin-1 value of the
   first entry in each row.  Currently compiled out. */
static const unsigned char tb_latin1_to_cp850[128] =
{
  /* 0x80 */ 186, 205, 201, 187, 200, 188, 204, 185,
  /* 0x88 */ 203, 202, 206, 223, 220, 219, 254, 242,
  /* 0x90 */ 179, 196, 218, 191, 192, 217, 195, 180,
  /* 0x98 */ 194, 193, 197, 176, 177, 178, 213, 159,
  /* 0xA0 */ 255, 173, 189, 156, 207, 190, 221, 245,
  /* 0xA8 */ 249, 184, 166, 174, 170, 240, 169, 238,
  /* 0xB0 */ 248, 241, 253, 252, 239, 230, 244, 250,
  /* 0xB8 */ 247, 251, 167, 175, 172, 171, 243, 168,
  /* 0xC0 */ 183, 181, 182, 199, 142, 143, 146, 128,
  /* 0xC8 */ 212, 144, 210, 211, 222, 214, 215, 216,
  /* 0xD0 */ 209, 165, 227, 224, 226, 229, 153, 158,
  /* 0xD8 */ 157, 235, 233, 234, 154, 237, 232, 225,
  /* 0xE0 */ 133, 160, 131, 198, 132, 134, 145, 135,
  /* 0xE8 */ 138, 130, 136, 137, 141, 161, 140, 139,
  /* 0xF0 */ 208, 164, 149, 162, 147, 228, 148, 246,
  /* 0xF8 */ 155, 151, 163, 150, 129, 236, 231, 152,
};
#endif
/* Return the Latin-1 upper-case equivalent of C, or C itself when it
   has none.

   Two ranges are folded, each by subtracting 0x20:
     - ASCII 'a'..'z';
     - the Latin-1 letters 0xE0..0xFE, except 0xF7 (the division sign).
   Anything else, including 0xFF and values above 255, is returned
   unchanged.

   The Latin-1 bounds are spelled as numeric code points on purpose.
   Written as character literals they do not survive 7-bit handling of
   this file: reduced to '`', '~' and 'w', they made this function fold
   the ASCII punctuation '`', '{', '|', '}' and '~' and leave every
   accented letter alone. */
static inline unsigned latin1_upcase (unsigned c)
{
  enum
  {
    CASE_OFFSET = 'a' - 'A',    /* distance between the two cases */
    ACCENTED_FIRST = 0xE0,      /* first lower-case accented letter */
    ACCENTED_LAST = 0xFE,       /* last lower-case letter that folds */
    DIVISION_SIGN = 0xF7        /* sits inside the range, not a letter */
  };

  /* Unsigned subtraction turns each two-sided range test into a single
     comparison: values below the lower bound wrap to something huge. */
  if (c - 'a' <= (unsigned) ('z' - 'a'))
    return c - CASE_OFFSET;

  if (c - ACCENTED_FIRST <= (unsigned) (ACCENTED_LAST - ACCENTED_FIRST)
      && c != DIVISION_SIGN)
    return c - CASE_OFFSET;

  return c;
}
/* Return the Latin-1 lower-case equivalent of C, or C itself when it
   has none.

   Two ranges are folded, each by adding 0x20:
     - ASCII 'A'..'Z';
     - the Latin-1 letters 0xC0..0xDE, except 0xD7 (the multiplication
       sign).
   Anything else, including 0xDF and values above 255, is returned
   unchanged.

   The Latin-1 bounds are spelled as numeric code points on purpose.
   Written as character literals they do not survive 7-bit handling of
   this file: reduced to '@', '^' and 'W', they made this function fold
   the ASCII punctuation '@', '[', '\', ']' and '^' and leave every
   accented letter alone. */
static inline unsigned latin1_downcase (unsigned c)
{
  enum
  {
    CASE_OFFSET = 'a' - 'A',    /* distance between the two cases */
    ACCENTED_FIRST = 0xC0,      /* first upper-case accented letter */
    ACCENTED_LAST = 0xDE,       /* last upper-case letter that folds */
    MULTIPLICATION_SIGN = 0xD7  /* sits inside the range, not a letter */
  };

  /* Unsigned subtraction turns each two-sided range test into a single
     comparison: values below the lower bound wrap to something huge. */
  if (c - 'A' <= (unsigned) ('Z' - 'A'))
    return c + CASE_OFFSET;

  if (c - ACCENTED_FIRST <= (unsigned) (ACCENTED_LAST - ACCENTED_FIRST)
      && c != MULTIPLICATION_SIGN)
    return c + CASE_OFFSET;

  return c;
}
#if 0
/* Recode the Latin-1 character C into code page 850.  Currently
   compiled out.

   Values 128..255 are looked up in tb_latin1_to_cp850; everything else
   is returned unchanged.  The upper bound matters: the table has only
   128 entries, so an unbounded "c >= 128" test would read past its end
   for any C above 255.  The test is done in unsigned arithmetic so that
   very large values cannot overflow a signed subtraction. */
static inline unsigned latin1_to_cp850 (unsigned c)
{
  if (c >= 128 && c - 128 < sizeof tb_latin1_to_cp850)
    return tb_latin1_to_cp850[c - 128];

  return c;
}
#endif
135 staticinlineunsignedcp850_to_latin1 (unsignedc)
/* */ 136 { 137 if ((signed) c - 128 >= 0)
138 returntb_cp850_to_latin1[c - 128];
139 else 140 returnc;
141 } 142
/* Take a character C as passed to cp850_to_latin1, recode it, and
   return the result of latin1_upcase on the recoded value. */
unsigned hpfs_char_to_upper_linux (unsigned c)
{
  unsigned recoded = cp850_to_latin1 (c);

  return latin1_upcase (recoded);
}
/* Return the result of latin1_upcase on C.  No recoding is done, so C
   is taken to be in the Linux-side character set already. */
unsigned linux_char_to_upper_linux (unsigned c)
{
  unsigned upper = latin1_upcase (c);

  return upper;
}
/* Take a character C as passed to cp850_to_latin1, recode it, and
   return the result of latin1_downcase on the recoded value. */
unsigned hpfs_char_to_lower_linux (unsigned c)
{
  unsigned recoded = cp850_to_latin1 (c);

  return latin1_downcase (recoded);
}
/* Return C recoded by cp850_to_latin1, with no case change. */
unsigned hpfs_char_to_linux (unsigned c)
{
  unsigned recoded = cp850_to_latin1 (c);

  return recoded;
}