1 <?php
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
22
/**
 * Factory and shared contract for encoding-aware string helpers.
 *
 * A concrete helper is chosen from the encoding name and kept in a static
 * cache, so repeated requests for the same encoding return the same object.
 */
abstract class phpMorphy_UnicodeHelper {
    /** @var phpMorphy_UnicodeHelper[] helpers already built, keyed by lower-cased encoding name */
    protected static $cache = array();

    /**
     * Returns the helper for $encoding, building and caching it on first use.
     *
     * The name is lower-cased through the callable stored in
     * $GLOBALS['__phpmorphy_strtolower'] before it is used as the cache key.
     *
     * @param string $encoding encoding name, e.g. "utf-8", "UTF-16LE", "windows-1251"
     * @return phpMorphy_UnicodeHelper
     * @throws phpMorphy_Exception when a utf-* / ucs-* name has an unsupported width
     */
    static function create($encoding) {
        $key = $GLOBALS['__phpmorphy_strtolower']($encoding);

        if(!isset(self::$cache[$key])) {
            self::$cache[$key] = self::doCreate($key);
        }

        return self::$cache[$key];
    }

    /**
     * Builds a new helper for an already lower-cased encoding name.
     *
     * Names of the form utf|ucs[-]<digits>[-][le|be] are mapped onto a
     * phpMorphy_UnicodeHelper_<family>_<width>[le|be] class; every other name
     * is served by phpMorphy_UnicodeHelper_singlebyte.
     *
     * @param string $encoding lower-cased encoding name
     * @return phpMorphy_UnicodeHelper
     * @throws phpMorphy_Exception when the width is not valid for the family
     */
    protected static function doCreate($encoding) {
        if(!preg_match('~^(utf|ucs)(-)?([0-9]+)(-)?(le|be)?$~', $encoding, $parts)) {
            return new phpMorphy_UnicodeHelper_singlebyte($encoding);
        }

        $family = $parts[1];
        $width = (int)$parts[3];

        if('utf' == $family) {
            if(!in_array($width, array(8, 16, 32))) {
                throw new phpMorphy_Exception('Invalid utf base');
            }
        } else if('ucs' == $family) {
            if(!in_array($width, array(2, 4))) {
                throw new phpMorphy_Exception('Invalid ucs base');
            }
        } else {
            throw new phpMorphy_Exception('Internal error');
        }

        // Only encodings with multi-byte code units carry a byte-order suffix;
        // for utf-8 the suffix stays empty.
        $byte_order = '';

        if('ucs' === $family || $width > 8) {
            if(isset($parts[5])) {
                $byte_order = 'be' == $parts[5] ? 'be' : 'le';
            } else {
                // No explicit byte order: use the byte order of this machine,
                // read from the first byte of a packed unsigned long.
                $probe = pack('L', 1);
                $byte_order = 0 == ord($probe[0]) ? 'be' : 'le';
            }
        }

        $encoding_name = "$family-$width$byte_order";
        $clazz = "phpMorphy_UnicodeHelper_" . str_replace('-', '_', $encoding_name);

        return new $clazz($encoding_name);
    }

    /**
     * @param string $str
     * @return int size in bytes of the first character of $str
     */
    abstract function firstCharSize($str);

    /**
     * @param string $str
     * @return string $str with its characters in reverse order
     */
    abstract function strrev($str);

    /**
     * @param string $str
     * @return int length of $str in characters
     */
    abstract function strlen($str);

    /**
     * @param string $str
     * @return string $str without an incomplete trailing character
     */
    abstract function fixTrailing($str);
}
91
/**
 * Common base for the concrete helpers: remembers the encoding name and
 * picks the character-counting function (iconv, mbstring or a pure PHP
 * fallback supplied by the subclass).
 */
abstract class phpMorphy_UnicodeHelper_Base extends phpMorphy_UnicodeHelper {
    protected static
        $ICONV,      // bool, true when the iconv extension is loaded
        $MB,         // bool, true when the mbstring extension is loaded
        $STRLEN_FOO  // name of the extension strlen function, unset when none is available
        ;

    protected
        $encoding,
        $strlen_foo,
        $iconv,
        $mb
        ;

    /**
     * @param string $encoding encoding name handed to the extension strlen function
     */
    protected function __construct($encoding) {
        $this->encoding = $encoding;

        if(!isset(self::$ICONV) || !isset(self::$MB)) {
            // Record both flags in one pass. Previously $MB stayed unset whenever
            // iconv was present, so this detection ran again for every helper.
            self::$ICONV = extension_loaded('iconv');
            self::$MB = extension_loaded('mbstring');

            if(self::$ICONV) {
                self::$STRLEN_FOO = 'iconv_strlen';
            } else if(self::$MB) {
                self::$STRLEN_FOO = 'mb_strlen';
            }
        }
    }

    /**
     * Length of $str in characters.
     *
     * Delegates to iconv_strlen/mb_strlen (called with this helper's encoding)
     * when one of them was detected, otherwise to php_strlen().
     *
     * @param string $str
     * @return int|false whatever the selected strlen implementation returns
     */
    function strlen($str) {
        if(isset(self::$STRLEN_FOO)) {
            $foo = self::$STRLEN_FOO;
            return $foo($str, $this->encoding);
        } else {
            return $this->php_strlen($str);
        }
    }

    /**
     * Pure PHP character count, used when neither iconv nor mbstring is loaded.
     *
     * @param string $str
     * @return int
     */
    protected abstract function php_strlen($str);
}
145
/**
 * Helper for encodings in which every character occupies the same number
 * of bytes.
 */
class phpMorphy_UnicodeHelper_MultiByteFixed extends phpMorphy_UnicodeHelper_Base {
    /** @var int bytes per character */
    protected $size;

    /**
     * @param string $encoding encoding name
     * @param int $size bytes per character
     */
    protected function __construct($encoding, $size) {
        parent::__construct($encoding);
        $this->size = $size;
    }

    /**
     * @param string $str
     * @return int always the fixed character size
     */
    function firstCharSize($str) {
        return $this->size;
    }

    /**
     * Reverses $str character by character (chunks of $size bytes).
     *
     * @param string $str
     * @return string
     */
    function strrev($str) {
        return implode('', array_reverse(str_split($str, $this->size)));
    }

    /**
     * Counts the complete characters in $str.
     *
     * Bytes of an incomplete trailing character are ignored, so the result is
     * always a whole number (plain division returned e.g. 1.25 for 5 bytes at
     * 4 bytes per character).
     *
     * @param string $str
     * @return int
     */
    protected function php_strlen($str) {
        $bytes = $GLOBALS['__phpmorphy_strlen']($str);

        return ($bytes - $bytes % $this->size) / $this->size;
    }

    /**
     * Cuts off the bytes of an incomplete trailing character.
     *
     * @param string $str
     * @return string
     */
    function fixTrailing($str) {
        $len = $GLOBALS['__phpmorphy_strlen']($str);
        $extra = $len % $this->size;

        if($extra > 0) {
            // integer offset; the former floor() expression produced a float
            return $GLOBALS['__phpmorphy_substr']($str, 0, $len - $extra);
        }

        return $str;
    }
}
177
178
/**
 * Helper for single-byte encodings: one byte is one character, so every
 * operation maps directly onto the byte-oriented string functions.
 */
class phpMorphy_UnicodeHelper_singlebyte extends phpMorphy_UnicodeHelper_Base {
    /**
     * @param string $str
     * @return int always 1
     */
    function firstCharSize($str) {
        return 1;
    }

    /**
     * @param string $str
     * @return string $str reversed byte by byte
     */
    function strrev($str) {
        $reversed = strrev($str);

        return $reversed;
    }

    /**
     * Length of $str, taken straight from the byte-length callable in
     * $GLOBALS['__phpmorphy_strlen'] (no iconv/mbstring involved).
     *
     * @param string $str
     * @return int
     */
    function strlen($str) {
        $byte_length = $GLOBALS['__phpmorphy_strlen'];

        return $byte_length($str);
    }

    /**
     * A single-byte string cannot end with a partial character.
     *
     * @param string $str
     * @return string $str unchanged
     */
    function fixTrailing($str) {
        return $str;
    }

    /**
     * @param string $str
     * @return int
     */
    protected function php_strlen($str) {
        $byte_length = $GLOBALS['__phpmorphy_strlen'];

        return $byte_length($str);
    }
}
200
201
/**
 * Helper for UTF-8 strings.
 */
class phpMorphy_UnicodeHelper_utf_8 extends phpMorphy_UnicodeHelper_Base {
    /** @var int[] for each byte value 0..255, the number of bytes that follow it in a sequence */
    protected $tails_length;

    /**
     * @param string $encoding encoding name
     */
    protected function __construct($encoding) {
        parent::__construct($encoding);

        $this->tails_length = $this->getTailsLength();
    }

    /**
     * Size in bytes of the first character, derived from its lead byte.
     *
     * @param string $str non-empty string
     * @return int
     */
    function firstCharSize($str) {
        $lead = ord($str[0]);

        return $this->tails_length[$lead] + 1;
    }

    /**
     * Reverses $str character by character.
     *
     * @param string $str
     * @return string
     */
    function strrev($str) {
        preg_match_all('/./us', $str, $chars);

        return implode('', array_reverse($chars[0]));
    }

    /**
     * Drops an incomplete multi-byte sequence from the end of $str.
     *
     * @param string $str
     * @return string $str itself when its last sequence is complete, a
     *                shortened copy otherwise, '' when no lead byte is found
     */
    function fixTrailing($str) {
        $total = $GLOBALS['__phpmorphy_strlen']($str);

        if(!$total) {
            return '';
        }

        // An ASCII byte at the end means nothing was cut.
        if((ord($str[$total - 1]) & 0x80) == 0) {
            return $str;
        }

        // Walk backwards to the nearest lead byte (top two bits set).
        $pos = $total;

        while(--$pos >= 0) {
            $byte = ord($str[$pos]);

            if(($byte & 0xC0) != 0xC0) {
                continue;
            }

            $present = $total - $pos;
            $expected = $this->tails_length[$byte] + 1;

            if($expected == $present) {
                return $str;
            }

            // The sequence length disagrees with the bytes available:
            // cut everything from the lead byte onwards.
            return $GLOBALS['__phpmorphy_substr']($str, 0, -$present);
        }

        return '';
    }

    /**
     * Character count via a UTF-8 aware regular expression.
     *
     * @param string $str
     * @return int
     */
    protected function php_strlen($str) {
        preg_match_all('/./us', $str, $chars);

        return count($chars[0]);
    }

    /**
     * Builds the 256-entry table of trailing-byte counts indexed by byte value.
     *
     * @return int[]
     */
    protected function getTailsLength() {
        return array_merge(
            array_fill(0, 192, 0), // 0x00-0xBF
            array_fill(0, 32, 1),  // 0xC0-0xDF
            array_fill(0, 16, 2),  // 0xE0-0xEF
            array_fill(0, 8, 3),   // 0xF0-0xF7
            array_fill(0, 4, 4),   // 0xF8-0xFB
            array_fill(0, 2, 5),   // 0xFC-0xFD
            array_fill(0, 2, 0)    // 0xFE-0xFF
        );
    }
}
293
294
/**
 * Shared implementation for the UTF-16 helpers; the byte order is chosen
 * by the subclass.
 *
 * A character is either one 16-bit word or a surrogate pair: a high
 * surrogate (0xD800-0xDBFF) followed by a low surrogate (0xDC00-0xDFFF).
 */
class phpMorphy_UnicodeHelper_utf_16_Base extends phpMorphy_UnicodeHelper_Base {
    protected
        $is_be,    // bool, true for big-endian words
        $char_fmt; // pack()/unpack() code for one word: 'n' (big-endian) or 'v' (little-endian)

    /**
     * @param string $encoding encoding name
     * @param bool $isBigEndian true for big-endian, false for little-endian
     */
    protected function __construct($encoding, $isBigEndian) {
        parent::__construct($encoding);

        $this->is_be = (bool)$isBigEndian;
        $this->char_fmt = $isBigEndian ? 'n' : 'v';
    }

    /**
     * Size in bytes of the first character: 4 when the first word is a
     * surrogate, 2 otherwise.
     *
     * @param string $str string of at least two bytes
     * @return int
     */
    function firstCharSize($str) {
        list(, $ord) = unpack($this->char_fmt, $str);

        return $this->isSurrogate($ord) ? 4 : 2;
    }

    /**
     * Reverses $str character by character, keeping surrogate pairs intact.
     *
     * A dangling odd byte at the end of $str is ignored.
     *
     * @param string $str
     * @return string
     */
    function strrev($str) {
        // whole 16-bit words only; a fractional count would corrupt the pack format
        $count = $GLOBALS['__phpmorphy_strlen']($str) >> 1;

        if($count < 1) {
            return '';
        }

        $fmt = $this->char_fmt . $count;
        $words = array_reverse(unpack($fmt, $str));

        for($i = 0; $i + 1 < $count; $i++) {
            // After the reversal a pair reads low, high: swap it back. Unpaired
            // surrogates are left alone, which also keeps $i + 1 within range.
            if($this->isLowSurrogate($words[$i]) && $this->isHighSurrogate($words[$i + 1])) {
                $t = $words[$i];
                $words[$i] = $words[$i + 1];

                $i++;
                $words[$i] = $t;
            }
        }

        array_unshift($words, $fmt);

        return call_user_func_array('pack', $words);
    }

    /**
     * Drops an incomplete character (odd byte and/or unpaired surrogate)
     * from the end of $str.
     *
     * @param string $str
     * @return string
     */
    function fixTrailing($str) {
        $strlen = $GLOBALS['__phpmorphy_strlen']($str);

        if($strlen & 1) {
            $strlen--;
            $str = $GLOBALS['__phpmorphy_substr']($str, 0, $strlen);
        }

        if($strlen < 2) {
            return '';
        }

        list(, $ord) = unpack($this->char_fmt, $GLOBALS['__phpmorphy_substr']($str, -2, 2));

        if($this->isSurrogate($ord)) {
            if($strlen < 4) {
                return '';
            }

            // Only "high, low" is a complete pair. Checking merely that both
            // words are surrogates let a cut-off high surrogate that follows
            // a full pair (H L H) slip through.
            if($this->isLowSurrogate($ord)) {
                list(, $prev) = unpack($this->char_fmt, $GLOBALS['__phpmorphy_substr']($str, -4, 2));

                if($this->isHighSurrogate($prev)) {
                    return $str;
                }
            }

            return $GLOBALS['__phpmorphy_substr']($str, 0, -2);
        }

        return $str;
    }

    /**
     * Pure PHP character count: the number of 16-bit words minus one for
     * every surrogate pair.
     *
     * @param string $str
     * @return int
     */
    protected function php_strlen($str) {
        $count = $GLOBALS['__phpmorphy_strlen']($str) >> 1;

        if($count < 1) {
            return 0;
        }

        foreach(unpack($this->char_fmt . $count, $str) as $ord) {
            // A pair is two words but one character, so discount one word per
            // pair. Discounting every surrogate counted each pair as zero.
            if($this->isHighSurrogate($ord)) {
                $count--;
            }
        }

        return $count;
    }

    /**
     * @param int $ord 16-bit word value
     * @return bool true when $ord lies in the surrogate range 0xD800-0xDFFF
     */
    protected function isSurrogate($ord) {
        return $ord >= 0xD800 && $ord <= 0xDFFF;
    }

    /**
     * @param int $ord 16-bit word value
     * @return bool true when $ord is the first half of a pair (0xD800-0xDBFF)
     */
    protected function isHighSurrogate($ord) {
        return $ord >= 0xD800 && $ord <= 0xDBFF;
    }

    /**
     * @param int $ord 16-bit word value
     * @return bool true when $ord is the second half of a pair (0xDC00-0xDFFF)
     */
    protected function isLowSurrogate($ord) {
        return $ord >= 0xDC00 && $ord <= 0xDFFF;
    }
}
388
/**
 * UTF-16 helper with little-endian words.
 */
class phpMorphy_UnicodeHelper_utf_16le extends phpMorphy_UnicodeHelper_utf_16_Base {
    /**
     * @param string $encoding encoding name
     */
    protected function __construct($encoding) {
        $isBigEndian = false;

        parent::__construct($encoding, $isBigEndian);
    }
}
394
/**
 * UTF-16 helper with big-endian words.
 */
class phpMorphy_UnicodeHelper_utf_16be extends phpMorphy_UnicodeHelper_utf_16_Base {
    /**
     * @param string $encoding encoding name
     */
    protected function __construct($encoding) {
        $isBigEndian = true;

        parent::__construct($encoding, $isBigEndian);
    }
}
400
401
/**
 * Shared base for the UTF-32 helpers: fixed width, four bytes per character.
 */
class phpMorphy_UnicodeHelper_utf_32_Base extends phpMorphy_UnicodeHelper_MultiByteFixed {
    /**
     * @param string $encoding encoding name
     */
    protected function __construct($encoding) {
        $bytes_per_char = 4;

        parent::__construct($encoding, $bytes_per_char);
    }
}
405
/**
 * UTF-32 little-endian helper; all behaviour comes from the UTF-32 base.
 */
class phpMorphy_UnicodeHelper_utf_32le extends phpMorphy_UnicodeHelper_utf_32_Base {
}
407
/**
 * UTF-32 big-endian helper; all behaviour comes from the UTF-32 base.
 */
class phpMorphy_UnicodeHelper_utf_32be extends phpMorphy_UnicodeHelper_utf_32_Base {
}
409
410
/**
 * UCS-2 little-endian helper: fixed width, two bytes per character.
 */
class phpMorphy_UnicodeHelper_ucs_2le extends phpMorphy_UnicodeHelper_MultiByteFixed {
    /**
     * @param string $encoding encoding name
     */
    protected function __construct($encoding) {
        $bytes_per_char = 2;

        parent::__construct($encoding, $bytes_per_char);
    }
}
414
/**
 * UCS-2 big-endian helper: fixed width, two bytes per character.
 */
class phpMorphy_UnicodeHelper_ucs_2be extends phpMorphy_UnicodeHelper_MultiByteFixed {
    /**
     * @param string $encoding encoding name
     */
    protected function __construct($encoding) {
        $bytes_per_char = 2;

        parent::__construct($encoding, $bytes_per_char);
    }
}
418
/**
 * UCS-4 little-endian helper: fixed width, four bytes per character.
 */
class phpMorphy_UnicodeHelper_ucs_4le extends phpMorphy_UnicodeHelper_MultiByteFixed {
    /**
     * @param string $encoding encoding name
     */
    protected function __construct($encoding) {
        $bytes_per_char = 4;

        parent::__construct($encoding, $bytes_per_char);
    }
}
422
/**
 * UCS-4 big-endian helper: fixed width, four bytes per character.
 */
class phpMorphy_UnicodeHelper_ucs_4be extends phpMorphy_UnicodeHelper_MultiByteFixed {
    /**
     * @param string $encoding encoding name
     */
    protected function __construct($encoding) {
        $bytes_per_char = 4;

        parent::__construct($encoding, $bytes_per_char);
    }
}
426
- [Raise a SilverStripe Framework issue/bug](https://github.com/silverstripe/silverstripe-framework/issues/new)
- [Raise a SilverStripe CMS issue/bug](https://github.com/silverstripe/silverstripe-cms/issues/new)
- Please use the SilverStripe Forums to ask development-related questions.
-