1 <?php
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
22
// Root directory of the library; an including script may predefine it to
// point somewhere else, in which case that value is left untouched.
defined('PHPMORPHY_DIR') or define('PHPMORPHY_DIR', dirname(__FILE__));
26
27 require_once(PHPMORPHY_DIR . '/fsa/fsa.php');
28 require_once(PHPMORPHY_DIR . '/graminfo/graminfo.php');
29 require_once(PHPMORPHY_DIR . '/morphiers.php');
30 require_once(PHPMORPHY_DIR . '/gramtab.php');
31 require_once(PHPMORPHY_DIR . '/storage.php');
32 require_once(PHPMORPHY_DIR . '/source.php');
33 require_once(PHPMORPHY_DIR . '/langs_stuff/common.php');
34
/**
 * Exception type thrown by the phpMorphy classes in this file.
 */
class phpMorphy_Exception extends Exception
{
}
36
37
38
39
/**
 * Publishes the names of the byte-oriented string functions the library
 * should call into $GLOBALS['__phpmorphy_<function>'].
 *
 * Each stored name is $prefix followed by the plain function name, so an
 * empty prefix selects the built-in functions themselves.
 *
 * @param string $prefix Prefix prepended to every function name
 * @return void
 */
function phpmorphy_overload_mb_funcs($prefix) {
    $function_names = array(
        'strlen',
        'strpos',
        'strrpos',
        'substr',
        'strtolower',
        'strtoupper',
        'substr_count',
    );

    foreach($function_names as $function_name) {
        $GLOBALS['__phpmorphy_' . $function_name] = $prefix . $function_name;
    }
}
49
// Bit 2 of mbstring.func_overload means the str*() functions are overloaded
// by mbstring; in that case the 'mb_orig_' prefixed names are registered,
// otherwise the plain function names are used.
phpmorphy_overload_mb_funcs(
    2 == (ini_get('mbstring.func_overload') & 2) ? 'mb_orig_' : ''
);
55
/**
 * Builds the file names of the dictionary files that belong to one language.
 *
 * Every name has the form "<dir><token>.<lang>[.<extraExt>].bin", where <dir>
 * always ends with exactly one DIRECTORY_SEPARATOR and <lang> is lower-cased.
 */
class phpMorphy_FilesBundle {
    protected
        $dir,
        $lang;

    /**
     * @param string $dirName Directory holding the dictionary files; trailing
     *                        slashes and backslashes are stripped
     * @param string $lang    Language identifier (stored lower-cased)
     */
    public function __construct($dirName, $lang) {
        $this->dir = rtrim($dirName, "\\/" . DIRECTORY_SEPARATOR) . DIRECTORY_SEPARATOR;
        $this->setLang($lang);
    }

    /**
     * Legacy PHP 4 style constructor name.
     *
     * PHP 8 no longer treats a method named after the class as a constructor,
     * so the real initialisation lives in __construct(). This method is kept
     * so that existing code calling parent::phpMorphy_FilesBundle() keeps
     * working.
     *
     * @param string $dirName
     * @param string $lang
     */
    public function phpMorphy_FilesBundle($dirName, $lang) {
        $this->__construct($dirName, $lang);
    }

    /**
     * @return string Lower-cased language identifier
     */
    public function getLang() {
        return $this->lang;
    }

    /**
     * Stores the language identifier lower-cased, using the string function
     * registered in $GLOBALS['__phpmorphy_strtolower'].
     *
     * @param string $lang
     * @return void
     */
    public function setLang($lang) {
        $this->lang = $GLOBALS['__phpmorphy_strtolower']($lang);
    }

    /** @return string File name built from the 'common_aut' token */
    public function getCommonAutomatFile() {
        return $this->genFileName('common_aut');
    }

    /** @return string File name built from the 'predict_aut' token */
    public function getPredictAutomatFile() {
        return $this->genFileName('predict_aut');
    }

    /** @return string File name built from the 'morph_data' token */
    public function getGramInfoFile() {
        return $this->genFileName('morph_data');
    }

    /** @return string File name built from the 'morph_data_ancodes_cache' token */
    public function getGramInfoAncodesCacheFile() {
        return $this->genFileName('morph_data_ancodes_cache');
    }

    /** @return string File name built from the 'morph_data_ancodes_map' token */
    public function getAncodesMapFile() {
        return $this->genFileName('morph_data_ancodes_map');
    }

    /** @return string File name built from the 'gramtab' token */
    public function getGramTabFile() {
        return $this->genFileName('gramtab');
    }

    /** @return string File name built from the 'gramtab_txt' token */
    public function getGramTabFileWithTextIds() {
        return $this->genFileName('gramtab_txt');
    }

    /**
     * @param string|null $type Suffix of the 'common_dict_' token; null
     *                          selects 'db3'
     * @return string
     */
    public function getDbaFile($type) {
        if(!isset($type)) {
            $type = 'db3';
        }

        return $this->genFileName("common_dict_$type");
    }

    /**
     * This method had lost its name (a bare "function ()"), which is a parse
     * error; the name is the one phpMorphy::createGramInfo() calls.
     *
     * @return string File name built from the 'morph_data_header_cache' token
     */
    public function getGramInfoHeaderCacheFile() {
        return $this->genFileName('morph_data_header_cache');
    }

    /**
     * @param string      $token    Base name of the file
     * @param string|null $extraExt Optional extra extension inserted before '.bin'
     * @return string
     */
    protected function genFileName($token, $extraExt = null) {
        return $this->dir . $token . '.' . $this->lang . (isset($extraExt) ? '.' . $extraExt : '') . '.bin';
    }
}
118
/**
 * Converts a collection of word descriptors into plain PHP arrays.
 */
class phpMorphy_WordDescriptor_Collection_Serializer {
    /**
     * @param phpMorphy_WordDescriptor_Collection $collection
     * @param bool $asText When true grammar info is rendered as a string,
     *                     otherwise as an array with 'pos' and 'grammems'
     * @return array One entry per descriptor, in iteration order
     */
    function serialize(phpMorphy_WordDescriptor_Collection $collection, $asText) {
        $serialized = array();

        foreach($collection as $item) {
            $serialized[] = $this->processWordDescriptor($item, $asText);
        }

        return $serialized;
    }

    /**
     * @param phpMorphy_WordDescriptor $descriptor
     * @param bool $asText
     * @return array Keys: 'forms' (words), 'all' (grammar info per form) and
     *               'common' (always an empty string)
     */
    protected function processWordDescriptor(phpMorphy_WordDescriptor $descriptor, $asText) {
        $entry = array(
            'forms' => array(),
            'all' => array(),
            'common' => '',
        );

        foreach($descriptor as $form) {
            $entry['forms'][] = $form->getWord();
            $entry['all'][] = $this->serializeGramInfo($form, $asText);
        }

        return $entry;
    }

    /**
     * @param phpMorphy_WordForm $wordForm
     * @param bool $asText
     * @return string|array "<pos> <grammem>,<grammem>,..." when $asText is
     *                      truthy, otherwise array('pos' => ..., 'grammems' => ...)
     */
    protected function serializeGramInfo(phpMorphy_WordForm $wordForm, $asText) {
        if(!$asText) {
            return array(
                'pos' => $wordForm->getPartOfSpeech(),
                'grammems' => $wordForm->getGrammems()
            );
        }

        $grammems_text = implode(',', $wordForm->getGrammems());

        return $wordForm->getPartOfSpeech() . ' ' . $grammems_text;
    }
}
157
/**
 * Facade of the library: wires together the automata, grammar info, gramtab
 * and helper objects and forwards word queries to the morphier objects.
 *
 * The morphiers, the grammems provider and the word descriptor serializer are
 * created lazily: they are not declared as properties, so the first read of
 * e.g. $this->__common_morphier goes through __get(), which creates the object
 * and stores it as a dynamic property.
 */
// Dynamic property creation is deprecated since PHP 8.2; the attribute below
// opts this class in. PHP versions before 8.0 parse the line as a comment.
#[AllowDynamicProperties]
class phpMorphy {
    // Values of the 'resolve_ancodes' option, see createAncodesResolverInternal().
    const RESOLVE_ANCODES_AS_TEXT = 0;
    const RESOLVE_ANCODES_AS_DIALING = 1;
    const RESOLVE_ANCODES_AS_INT = 2;

    // Values of the $type argument of the lookup methods, see invoke().
    const NORMAL = 0;
    const IGNORE_PREDICT = 2;
    const ONLY_PREDICT = 3;

    // Values reported by getLastPredictionType().
    const PREDICT_BY_NONE = 'none';
    const PREDICT_BY_SUFFIX = 'by_suffix';
    const PREDICT_BY_DB = 'by_db';

    protected
        $storage_factory,
        $common_fsa,
        $common_source,
        $predict_fsa,
        $options,
        // The lazily created "__*" members are intentionally NOT declared
        // here: declaring them would stop __get() from being triggered.
        $helper,
        $last_prediction_type
        ;

    /**
     * Two call forms are supported:
     *  - new style: ($dir, $lang, $options) - a files bundle is created from
     *    $dir and $lang;
     *  - old style: (phpMorphy_FilesBundle $bundle, array $options) - detected
     *    when $dir is a bundle and $lang is an array.
     *
     * @param string|phpMorphy_FilesBundle $dir
     * @param string|array|null $lang
     * @param array $options See repairOptions() for the recognised keys
     */
    public function __construct($dir, $lang = null, $options = array()) {
        $this->options = $options = $this->repairOptions($options);

        if($dir instanceof phpMorphy_FilesBundle && is_array($lang)) {
            $this->initOldStyle($dir, $lang);
        } else {
            $this->initNewStyle($this->createFilesBundle($dir, $lang), $options);
        }

        $this->last_prediction_type = self::PREDICT_BY_NONE;
    }

    /**
     * @return mixed Morphier created by createCommonMorphier() (lazily)
     */
    public function getCommonMorphier() {
        return $this->__common_morphier;
    }

    /**
     * @return mixed Morphier created by createPredictBySuffixMorphier() (lazily)
     */
    public function getPredictBySuffixMorphier() {
        return $this->__predict_by_suf_morphier;
    }

    /**
     * @return mixed Morphier created by createPredictByDbMorphier() (lazily)
     */
    public function getPredictByDatabaseMorphier() {
        return $this->__predict_by_db_morphier;
    }

    /**
     * @return mixed Morphier created by createBulkMorphier() (lazily)
     */
    public function getBulkMorphier() {
        return $this->__bulk_morphier;
    }

    /**
     * @return mixed Encoding reported by the helper's grammar info object
     */
    public function getEncoding() {
        return $this->helper->getGramInfo()->getEncoding();
    }

    /**
     * @return mixed Locale reported by the helper's grammar info object
     */
    public function getLocale() {
        return $this->helper->getGramInfo()->getLocale();
    }

    /**
     * @return mixed A clone of the default grammems provider, so the caller
     *               can modify it without affecting this instance
     */
    public function getGrammemsProvider() {
        return clone $this->__grammems_provider;
    }

    /**
     * @return mixed The shared default grammems provider (not a copy)
     */
    public function getDefaultGrammemsProvider() {
        return $this->__grammems_provider;
    }

    /**
     * @return mixed Whatever the storage factory's getShmCache() returns
     */
    public function getShmCache() {
        return $this->storage_factory->getShmCache();
    }

    /**
     * @return bool True when the most recent lookup was answered by one of
     *              the predictors rather than the common morphier
     */
    public function isLastPredicted() {
        return self::PREDICT_BY_NONE !== $this->last_prediction_type;
    }

    /**
     * @return string One of the PREDICT_BY_* constants
     */
    public function getLastPredictionType() {
        return $this->last_prediction_type;
    }

    /**
     * Looks up word descriptors.
     *
     * An array of words is processed one word at a time, so afterwards the
     * prediction type reflects the last word only.
     *
     * @param string|array $word
     * @param int $type One of NORMAL, IGNORE_PREDICT, ONLY_PREDICT
     * @return mixed Result of the morphier's getWordDescriptor(); for an
     *               array input, an array keyed by word
     */
    public function findWord($word, $type = self::NORMAL) {
        if(is_array($word)) {
            $result = array();

            foreach($word as $w) {
                $result[$w] = $this->invoke('getWordDescriptor', $w, $type);
            }

            return $result;
        } else {
            return $this->invoke('getWordDescriptor', $word, $type);
        }
    }

    /**
     * Alias of getBaseForm().
     *
     * @param string|array $word
     * @param int $type One of NORMAL, IGNORE_PREDICT, ONLY_PREDICT
     * @return mixed
     */
    public function lemmatize($word, $type = self::NORMAL) {
        return $this->getBaseForm($word, $type);
    }

    /**
     * @param string|array $word
     * @param int $type One of NORMAL, IGNORE_PREDICT, ONLY_PREDICT
     * @return mixed Result of the morphier's getBaseForm(), see invoke()
     */
    public function getBaseForm($word, $type = self::NORMAL) {
        return $this->invoke('getBaseForm', $word, $type);
    }

    /**
     * @param string|array $word
     * @param int $type One of NORMAL, IGNORE_PREDICT, ONLY_PREDICT
     * @return mixed Result of the morphier's getAllForms(), see invoke()
     */
    public function getAllForms($word, $type = self::NORMAL) {
        return $this->invoke('getAllForms', $word, $type);
    }

    /**
     * @param string|array $word
     * @param int $type One of NORMAL, IGNORE_PREDICT, ONLY_PREDICT
     * @return mixed Result of the morphier's getPseudoRoot(), see invoke()
     */
    public function getPseudoRoot($word, $type = self::NORMAL) {
        return $this->invoke('getPseudoRoot', $word, $type);
    }

    /**
     * @param string|array $word
     * @param int $type One of NORMAL, IGNORE_PREDICT, ONLY_PREDICT
     * @return mixed Result of the morphier's getPartOfSpeech(), see invoke()
     */
    public function getPartOfSpeech($word, $type = self::NORMAL) {
        return $this->invoke('getPartOfSpeech', $word, $type);
    }

    /**
     * @param string|array $word
     * @param int $type One of NORMAL, IGNORE_PREDICT, ONLY_PREDICT
     * @return mixed Result of the morphier's getAllFormsWithAncodes(), see invoke()
     */
    public function getAllFormsWithAncodes($word, $type = self::NORMAL) {
        return $this->invoke('getAllFormsWithAncodes', $word, $type);
    }

    /**
     * Finds the word(s) and serializes the resulting descriptors to arrays.
     *
     * @param string|array $word
     * @param bool $asText Passed to the serializer (cast to bool first)
     * @param int $type One of NORMAL, IGNORE_PREDICT, ONLY_PREDICT
     * @return array|false For a single word: the serialized collection, or
     *                     false when findWord() returned false. For an array:
     *                     an array keyed by word whose entries are the
     *                     serialized collection or false.
     */
    public function getAllFormsWithGramInfo($word, $asText = true, $type = self::NORMAL) {
        if(false === ($result = $this->findWord($word, $type))) {
            return false;
        }

        $asText = (bool)$asText;

        if(is_array($word)) {
            $out = array();

            foreach($result as $w => $r) {
                if(false !== $r) {
                    $out[$w] = $this->processWordsCollection($r, $asText);
                } else {
                    $out[$w] = false;
                }
            }

            return $out;
        } else {
            return $this->processWordsCollection($result, $asText);
        }
    }

    /**
     * @param string|array $word
     * @param int $type One of NORMAL, IGNORE_PREDICT, ONLY_PREDICT
     * @return mixed Result of the morphier's getAncode(), see invoke()
     */
    public function getAncode($word, $type = self::NORMAL) {
        return $this->invoke('getAncode', $word, $type);
    }

    /**
     * @param string|array $word
     * @param int $type One of NORMAL, IGNORE_PREDICT, ONLY_PREDICT
     * @return mixed Result of the morphier's getGrammarInfo(), see invoke()
     */
    public function getGramInfo($word, $type = self::NORMAL) {
        return $this->invoke('getGrammarInfo', $word, $type);
    }

    /**
     * @param string|array $word
     * @param int $type One of NORMAL, IGNORE_PREDICT, ONLY_PREDICT
     * @return mixed Result of the morphier's getGrammarInfoMergeForms(), see invoke()
     */
    public function getGramInfoMergeForms($word, $type = self::NORMAL) {
        return $this->invoke('getGrammarInfoMergeForms', $word, $type);
    }

    /**
     * @param string|array $word
     * @param int $type One of NORMAL, IGNORE_PREDICT, ONLY_PREDICT
     * @return mixed Result of the morphier's getAnnot(), see invoke()
     */
    protected function getAnnotForWord($word, $type) {
        return $this->invoke('getAnnot', $word, $type);
    }

    /**
     * Casts $word to the form described by an ancode.
     *
     * Both ancodes are converted with the resolver's unresolve(); the part of
     * speech and grammems come from the helper's getGrammemsAndPartOfSpeech()
     * (index 0 and 1 of its result). When the common ancode resolves to a
     * non-null id, its grammems are appended.
     *
     * @param string $word
     * @param mixed $ancode
     * @param mixed $commonAncode
     * @param bool $returnOnlyWord Passed through to castFormByGramInfo()
     * @param mixed $callback Passed through to castFormByGramInfo()
     * @param int $type One of NORMAL, IGNORE_PREDICT, ONLY_PREDICT
     * @return mixed See castFormByGramInfo()
     */
    public function castFormByAncode($word, $ancode, $commonAncode = null, $returnOnlyWord = false, $callback = null, $type = self::NORMAL) {
        $resolver = $this->helper->getAncodesResolver();

        $common_ancode_id = $resolver->unresolve($commonAncode);
        $ancode_id = $resolver->unresolve($ancode);

        $data = $this->helper->getGrammemsAndPartOfSpeech($ancode_id);

        if(isset($common_ancode_id)) {
            $data[1] = array_merge($data[1], $this->helper->getGrammems($common_ancode_id));
        }

        return $this->castFormByGramInfo(
            $word,
            $data[0],
            $data[1],
            $returnOnlyWord,
            $callback,
            $type
        );
    }

    /**
     * Casts $word to the form with the given part of speech and grammems.
     *
     * @param string $word
     * @param mixed $partOfSpeech
     * @param array $grammems
     * @param bool $returnOnlyWord Passed through to the helper
     * @param mixed $callback Passed through to the helper
     * @param int $type One of NORMAL, IGNORE_PREDICT, ONLY_PREDICT
     * @return mixed False when no annotation is found for $word, otherwise
     *               the result of the helper's castFormByGramInfo()
     */
    public function castFormByGramInfo($word, $partOfSpeech, $grammems, $returnOnlyWord = false, $callback = null, $type = self::NORMAL) {
        if(false === ($annot = $this->getAnnotForWord($word, $type))) {
            return false;
        }

        return $this->helper->castFormByGramInfo($word, $annot, $partOfSpeech, $grammems, $returnOnlyWord, $callback);
    }

    /**
     * Casts $word to the form(s) that $patternWord has.
     *
     * For every grammar entry of the pattern word, its grammems are reduced
     * to those the grammems provider returns for that part of speech (unless
     * the provider returns false, in which case all are kept) and the helper
     * is asked to cast $word accordingly. All non-empty results are merged.
     *
     * @param string $word
     * @param string $patternWord
     * @param phpMorphy_GrammemsProvider_Interface|null $grammemsProvider
     *        Defaults to the instance's default provider
     * @param bool $returnOnlyWord When truthy, duplicates are removed from
     *                             the merged result
     * @param mixed $callback Passed through to the helper
     * @param int $type One of NORMAL, IGNORE_PREDICT, ONLY_PREDICT
     * @return array|false False when no annotation is found for $word; an
     *                     empty array when the pattern word has no grammar info
     */
    public function castFormByPattern($word, $patternWord, phpMorphy_GrammemsProvider_Interface $grammemsProvider = null, $returnOnlyWord = false, $callback = null, $type = self::NORMAL) {
        if(false === ($word_annot = $this->getAnnotForWord($word, $type))) {
            return false;
        }

        if(!isset($grammemsProvider)) {
            $grammemsProvider = $this->__grammems_provider;
        }

        $result = array();

        // invoke() yields false for an unknown word; iterating over false
        // would only raise a warning, so return the empty result directly.
        $paradigms = $this->getGramInfo($patternWord, $type);

        if(false === $paradigms) {
            return $result;
        }

        foreach($paradigms as $paradigm) {
            foreach($paradigm as $grammar) {
                $pos = $grammar['pos'];

                $essential_grammems = $grammemsProvider->getGrammems($pos);

                $grammems = false !== $essential_grammems ?
                    array_intersect($grammar['grammems'], $essential_grammems):
                    $grammar['grammems'];

                // NOTE(review): the sibling call in castFormByGramInfo() passes
                // six arguments; the trailing $type here is kept as it was -
                // confirm against the helper's signature.
                $res = $this->helper->castFormByGramInfo(
                    $word,
                    $word_annot,
                    $pos,
                    $grammems,
                    $returnOnlyWord,
                    $callback,
                    $type
                );

                // count() on a non-array (e.g. false) throws on PHP 8, hence
                // the explicit array check.
                if(is_array($res) && count($res) > 0) {
                    $result = array_merge($result, $res);
                }
            }
        }

        return $returnOnlyWord ? array_unique($result) : $result;
    }

    /**
     * @param phpMorphy_WordDescriptor_Collection $collection
     * @param bool $asText
     * @return array Result of the (lazily created) serializer
     */
    protected function processWordsCollection(phpMorphy_WordDescriptor_Collection $collection, $asText) {
        return $this->__word_descriptor_serializer->serialize($collection, $asText);
    }

    /**
     * Dispatches a lookup to the appropriate morphier(s).
     *
     *  - ONLY_PREDICT: only the predictors are consulted;
     *  - array input: the bulk morphier handles all words, then every word it
     *    reports as not found is either predicted or, with IGNORE_PREDICT,
     *    set to false;
     *  - single word: the common morphier is tried first and the predictors
     *    are used on a miss unless IGNORE_PREDICT is given.
     *
     * The last prediction type is reset at the start of every call.
     *
     * @param string $method Name of the morphier method to call
     * @param string|array $word
     * @param int $type One of NORMAL, IGNORE_PREDICT, ONLY_PREDICT
     * @return mixed Morphier result; false for a word nobody could resolve
     */
    protected function invoke($method, $word, $type) {
        $this->last_prediction_type = self::PREDICT_BY_NONE;

        if($type === self::ONLY_PREDICT) {
            if(is_array($word)) {
                $result = array();

                foreach($word as $w) {
                    $result[$w] = $this->predictWord($method, $w);
                }

                return $result;
            } else {
                return $this->predictWord($method, $word);
            }
        }

        if(is_array($word)) {
            $result = $this->__bulk_morphier->$method($word);

            // Fetched before branching: previously the IGNORE_PREDICT branch
            // read $not_found without ever assigning it.
            $not_found = $this->__bulk_morphier->getNotFoundWords();

            if($type !== self::IGNORE_PREDICT) {
                foreach($not_found as $missing_word) {
                    $result[$missing_word] = $this->predictWord($method, $missing_word);
                }
            } else {
                foreach($not_found as $missing_word) {
                    $result[$missing_word] = false;
                }
            }

            return $result;
        } else {
            if(false === ($result = $this->__common_morphier->$method($word))) {
                if($type !== self::IGNORE_PREDICT) {
                    return $this->predictWord($method, $word);
                }
            }

            return $result;
        }
    }

    /**
     * Tries the suffix predictor first, then the database predictor, and
     * records which one answered in $last_prediction_type.
     *
     * @param string $method Name of the morphier method to call
     * @param string $word
     * @return mixed The first result that is not false, or false
     */
    protected function predictWord($method, $word) {
        if(false !== ($result = $this->__predict_by_suf_morphier->$method($word))) {
            $this->last_prediction_type = self::PREDICT_BY_SUFFIX;

            return $result;
        }

        if(false !== ($result = $this->__predict_by_db_morphier->$method($word))) {
            $this->last_prediction_type = self::PREDICT_BY_DB;

            return $result;
        }

        return false;
    }

    /**
     * Opens the dictionary files of $bundle and builds the helper.
     *
     * The storage factory and options are stored on the instance before
     * createGramInfo() runs because that method reads both.
     *
     * @param phpMorphy_FilesBundle $bundle
     * @param array $options
     * @return void
     */
    protected function initNewStyle(phpMorphy_FilesBundle $bundle, $options) {
        $this->options = $options = $this->repairOptions($options);
        $storage_type = $options['storage'];

        $storage_factory = $this->storage_factory = $this->createStorageFactory($options['shm']);
        $graminfo_as_text = $this->options['graminfo_as_text'];

        // Automata: the common one is created non-lazily, the predict one lazily.
        $this->common_fsa = $this->createFsa($storage_factory->open($storage_type, $bundle->getCommonAutomatFile(), false), false);
        $this->predict_fsa = $this->createFsa($storage_factory->open($storage_type, $bundle->getPredictAutomatFile(), true), true);

        // Grammar info
        $graminfo = $this->createGramInfo($storage_factory->open($storage_type, $bundle->getGramInfoFile(), true), $bundle);

        // Gramtab: the file variant depends on the 'graminfo_as_text' option.
        $gramtab = $this->createGramTab(
            $storage_factory->open(
                $storage_type,
                $graminfo_as_text ? $bundle->getGramTabFileWithTextIds() : $bundle->getGramTabFile(),
                true
            )
        );

        $this->helper = $this->createMorphierHelper($graminfo, $gramtab, $graminfo_as_text, $bundle);
    }

    /**
     * Creates the word source selected by $opts['type'].
     *
     * @param phpMorphy_FilesBundle $bundle
     * @param array $opts Array with 'type' and 'opts' keys, see repairSourceOptions()
     * @return phpMorphy_Source_Fsa|phpMorphy_Source_Dba
     * @throws phpMorphy_Exception When the type is unknown
     */
    protected function createCommonSource(phpMorphy_FilesBundle $bundle, $opts) {
        $type = $opts['type'];

        switch($type) {
            case PHPMORPHY_SOURCE_FSA:
                return new phpMorphy_Source_Fsa($this->common_fsa);
            case PHPMORPHY_SOURCE_DBA:
                $handler = isset($opts['opts']['handler']) ? $opts['opts']['handler'] : null;

                return new phpMorphy_Source_Dba(
                    $bundle->getDbaFile($this->getDbaHandlerName($handler)),
                    $opts['opts']
                );
            default:
                throw new phpMorphy_Exception("Unknown source type given '$type'");
        }
    }

    /**
     * @param string|null $name
     * @return string $name, or the default DBA handler when $name is null
     */
    protected function getDbaHandlerName($name) {
        return isset($name) ? $name : phpMorphy_Source_Dba::getDefaultHandler();
    }

    /**
     * Initialisation for the legacy (bundle, options) constructor form: maps
     * the three-letter language codes 'rus', 'eng' and 'ger' to locale-style
     * ones and then proceeds as initNewStyle().
     *
     * @param phpMorphy_FilesBundle $bundle
     * @param array $options
     * @return void
     */
    protected function initOldStyle(phpMorphy_FilesBundle $bundle, $options) {
        $options = $this->repairOptions($options);

        switch($bundle->getLang()) {
            case 'rus':
                $bundle->setLang('ru_RU');
                break;
            case 'eng':
                $bundle->setLang('en_EN');
                break;
            case 'ger':
                $bundle->setLang('de_DE');
                break;
        }

        $this->initNewStyle($bundle, $options);
    }

    /**
     * Fills in both prediction options with false where missing.
     *
     * @param array $options
     * @return array
     */
    protected function repairOldOptions($options) {
        $defaults = array(
            'predict_by_suffix' => false,
            'predict_by_db' => false,
        );

        return (array)$options + $defaults;
    }

    /**
     * Fills in the defaults of a 'common_source' option value.
     *
     * @param array|null $options
     * @return array Array with at least 'type' and 'opts' keys
     */
    protected function repairSourceOptions($options) {
        $defaults = array(
            'type' => PHPMORPHY_SOURCE_FSA,
            'opts' => null
        );

        return (array)$options + $defaults;
    }

    /**
     * Fills in the defaults of the constructor options. Keys supplied by the
     * caller always win over the defaults.
     *
     * @param array $options
     * @return array
     */
    protected function repairOptions($options) {
        $common_source = isset($options['common_source']) ? $options['common_source'] : null;

        $defaults = array(
            'shm' => array(),
            'graminfo_as_text' => true,
            'storage' => PHPMORPHY_STORAGE_FILE,
            'common_source' => $this->repairSourceOptions($common_source),
            'predict_by_suffix' => true,
            'predict_by_db' => true,
            'use_ancodes_cache' => false,
            'resolve_ancodes' => self::RESOLVE_ANCODES_AS_TEXT
        );

        return (array)$options + $defaults;
    }

    /**
     * Lazy initialiser for the "__*" members.
     *
     * Called by PHP on the first read of one of those undeclared properties;
     * the created object is stored under the same name, so later reads hit
     * the property directly.
     *
     * @param string $name
     * @return mixed The created object
     * @throws phpMorphy_Exception For any other property name
     */
    public function __get($name) {
        switch($name) {
            case '__predict_by_db_morphier':
                $this->__predict_by_db_morphier = $this->createPredictByDbMorphier(
                    $this->predict_fsa,
                    $this->helper
                );

                break;
            case '__predict_by_suf_morphier':
                $this->__predict_by_suf_morphier = $this->createPredictBySuffixMorphier(
                    $this->common_fsa,
                    $this->helper
                );

                break;
            case '__bulk_morphier':
                $this->__bulk_morphier = $this->createBulkMorphier(
                    $this->common_fsa,
                    $this->helper
                );

                break;
            case '__common_morphier':
                $this->__common_morphier = $this->createCommonMorphier(
                    $this->common_fsa,
                    $this->helper
                );

                break;
            case '__word_descriptor_serializer':
                $this->__word_descriptor_serializer = $this->createWordDescriptorSerializer();
                break;
            case '__grammems_provider':
                $this->__grammems_provider = $this->createGrammemsProvider();
                break;
            default:
                throw new phpMorphy_Exception("Invalid prop name '$name'");
        }

        return $this->$name;
    }

    ////////////////////
    // factory methods
    ////////////////////

    /**
     * @return mixed Provider built by phpMorphy_GrammemsProvider_Factory for this instance
     */
    public function createGrammemsProvider() {
        return phpMorphy_GrammemsProvider_Factory::create($this);
    }

    /**
     * @return phpMorphy_WordDescriptor_Collection_Serializer
     */
    protected function createWordDescriptorSerializer() {
        return new phpMorphy_WordDescriptor_Collection_Serializer();
    }

    /**
     * @param string $dir
     * @param string $lang
     * @return phpMorphy_FilesBundle
     */
    protected function createFilesBundle($dir, $lang) {
        return new phpMorphy_FilesBundle($dir, $lang);
    }

    /**
     * @param array $options Value of the 'shm' option
     * @return phpMorphy_Storage_Factory
     */
    protected function createStorageFactory($options) {
        return new phpMorphy_Storage_Factory($options);
    }

    /**
     * @param phpMorphy_Storage $storage
     * @param bool $lazy
     * @return mixed Result of phpMorphy_Fsa::create()
     */
    protected function createFsa(phpMorphy_Storage $storage, $lazy) {
        return phpMorphy_Fsa::create($storage, $lazy);
    }

    /**
     * Builds the grammar info object: a header-aware proxy wrapped in a
     * runtime cache, additionally wrapped in an ancode cache when the
     * 'use_ancodes_cache' option is on.
     *
     * @param phpMorphy_Storage $graminfoFile
     * @param phpMorphy_FilesBundle $bundle
     * @return phpMorphy_GramInfo_RuntimeCaching|phpMorphy_GramInfo_AncodeCache
     */
    protected function createGramInfo(phpMorphy_Storage $graminfoFile, phpMorphy_FilesBundle $bundle) {
        $result = new phpMorphy_GramInfo_RuntimeCaching(
            new phpMorphy_GramInfo_Proxy_WithHeader(
                $graminfoFile,
                $bundle->getGramInfoHeaderCacheFile()
            )
        );

        if($this->options['use_ancodes_cache']) {
            return new phpMorphy_GramInfo_AncodeCache(
                $result,
                $this->storage_factory->open(
                    $this->options['storage'],
                    $bundle->getGramInfoAncodesCacheFile(),
                    true
                )
            );
        } else {
            return $result;
        }
    }

    /**
     * @param phpMorphy_Storage $storage
     * @return phpMorphy_GramTab_Proxy
     */
    protected function createGramTab(phpMorphy_Storage $storage) {
        return new phpMorphy_GramTab_Proxy($storage);
    }

    /**
     * Picks the resolver class and its constructor arguments according to
     * the 'resolve_ancodes' option.
     *
     * @param phpMorphy_GramTab_Interface $gramtab
     * @param phpMorphy_FilesBundle $bundle
     * @return array array(class name, array of constructor arguments)
     * @throws phpMorphy_Exception When the option holds an unknown value
     */
    protected function createAncodesResolverInternal(phpMorphy_GramTab_Interface $gramtab, phpMorphy_FilesBundle $bundle) {
        switch($this->options['resolve_ancodes']) {
            case self::RESOLVE_ANCODES_AS_TEXT:
                return array(
                    'phpMorphy_AncodesResolver_ToText',
                    array($gramtab)
                );
            case self::RESOLVE_ANCODES_AS_INT:
                return array(
                    'phpMorphy_AncodesResolver_AsIs',
                    array()
                );
            case self::RESOLVE_ANCODES_AS_DIALING:
                return array(
                    'phpMorphy_AncodesResolver_ToDialingAncodes',
                    array(
                        $this->storage_factory->open(
                            $this->options['storage'],
                            $bundle->getAncodesMapFile(),
                            true
                        )
                    )
                );
            default:
                throw new phpMorphy_Exception("Invalid resolve_ancodes option, valid values are RESOLVE_ANCODES_AS_DIALING, RESOLVE_ANCODES_AS_INT, RESOLVE_ANCODES_AS_TEXT");
        }
    }

    /**
     * @param phpMorphy_GramTab_Interface $gramtab
     * @param phpMorphy_FilesBundle $bundle
     * @param bool $lazy When truthy a proxy object is returned, otherwise the
     *                   result of the proxy's static instantinate()
     * @return mixed
     */
    protected function createAncodesResolver(phpMorphy_GramTab_Interface $gramtab, phpMorphy_FilesBundle $bundle, $lazy) {
        $result = $this->createAncodesResolverInternal($gramtab, $bundle);

        if($lazy) {
            return new phpMorphy_AncodesResolver_Proxy($result[0], $result[1]);
        } else {
            return phpMorphy_AncodesResolver_Proxy::instantinate($result[0], $result[1]);
        }
    }

    /**
     * Builds the helper shared by all morphiers; the ancodes resolver is
     * created in lazy mode.
     *
     * NOTE(review): the type name "phpMorphy_GramInfo_Interace" is kept as
     * spelled here - confirm it matches the declaration in graminfo.php
     * before "correcting" it.
     *
     * @param phpMorphy_GramInfo_Interace $graminfo
     * @param phpMorphy_GramTab_Interface $gramtab
     * @param bool $graminfoAsText
     * @param phpMorphy_FilesBundle $bundle
     * @return phpMorphy_Morphier_Helper
     */
    protected function createMorphierHelper(
        phpMorphy_GramInfo_Interace $graminfo,
        phpMorphy_GramTab_Interface $gramtab,
        $graminfoAsText,
        phpMorphy_FilesBundle $bundle
    ) {
        return new phpMorphy_Morphier_Helper(
            $graminfo,
            $gramtab,
            $this->createAncodesResolver($gramtab, $bundle, true),
            $graminfoAsText
        );
    }

    /**
     * @param phpMorphy_Fsa_Interface $fsa
     * @param phpMorphy_Morphier_Helper $helper
     * @return phpMorphy_Morphier_Common
     */
    protected function createCommonMorphier(phpMorphy_Fsa_Interface $fsa, phpMorphy_Morphier_Helper $helper) {
        return new phpMorphy_Morphier_Common($fsa, $helper);
    }

    /**
     * @param phpMorphy_Fsa_Interface $fsa
     * @param phpMorphy_Morphier_Helper $helper
     * @return phpMorphy_Morphier_Bulk
     */
    protected function createBulkMorphier(phpMorphy_Fsa_Interface $fsa, phpMorphy_Morphier_Helper $helper) {
        return new phpMorphy_Morphier_Bulk($fsa, $helper);
    }

    /**
     * @param phpMorphy_Fsa_Interface $fsa
     * @param phpMorphy_Morphier_Helper $helper
     * @return phpMorphy_Morphier_Predict_Database|phpMorphy_Morphier_Empty
     *         The empty morphier when the 'predict_by_db' option is off
     */
    protected function createPredictByDbMorphier(phpMorphy_Fsa_Interface $fsa, phpMorphy_Morphier_Helper $helper) {
        if($this->options['predict_by_db']) {
            return new phpMorphy_Morphier_Predict_Database($fsa, $helper);
        } else {
            return new phpMorphy_Morphier_Empty();
        }
    }

    /**
     * @param phpMorphy_Fsa_Interface $fsa
     * @param phpMorphy_Morphier_Helper $helper
     * @return phpMorphy_Morphier_Predict_Suffix|phpMorphy_Morphier_Empty
     *         The empty morphier when the 'predict_by_suffix' option is off
     */
    protected function createPredictBySuffixMorphier(phpMorphy_Fsa_Interface $fsa, phpMorphy_Morphier_Helper $helper) {
        if($this->options['predict_by_suffix']) {
            return new phpMorphy_Morphier_Predict_Suffix($fsa, $helper);
        } else {
            return new phpMorphy_Morphier_Empty();
        }
    }
}
852
- [Raise a SilverStripe Framework issue/bug](https://github.com/silverstripe/silverstripe-framework/issues/new)
- [Raise a SilverStripe CMS issue/bug](https://github.com/silverstripe/silverstripe-cms/issues/new)
- Please use the Silverstripe Forums to ask development related questions.