diff --git a/.drone.yml b/.drone.yml index e8f9ba8d269..b5d6294004f 100644 --- a/.drone.yml +++ b/.drone.yml @@ -7,7 +7,7 @@ pipeline: image: joomlaprojects/docker-phpcs commands: - echo $(date) - - /root/.composer/vendor/bin/phpcs --report=full --extensions=php -p --standard=build/phpcs/Joomla . + - /root/.composer/vendor/bin/phpcs --report=full --extensions=php -p --encoding=utf-8 --standard=build/phpcs/Joomla . - echo $(date) initdb: diff --git a/.gitignore b/.gitignore index b1b4ecee9eb..1280353a22d 100644 --- a/.gitignore +++ b/.gitignore @@ -221,6 +221,11 @@ Desktop.ini /libraries/vendor/simplepie/simplepie/build /libraries/vendor/simplepie/simplepie/idn/ReadMe.txt /libraries/vendor/simplepie/simplepie/composer.json +/libraries/vendor/wamania/php-stemmer/.gitignore +/libraries/vendor/wamania/php-stemmer/README.md +/libraries/vendor/wamania/php-stemmer/composer.json +/libraries/vendor/wamania/php-stemmer/phpunit.xml.dist +/libraries/vendor/wamania/php-stemmer/test /libraries/vendor/zendframework/zend-diactoros/.coveralls.yml /libraries/vendor/zendframework/zend-diactoros/CHANGELOG.md /libraries/vendor/zendframework/zend-diactoros/composer.json diff --git a/administrator/components/com_finder/config.xml b/administrator/components/com_finder/config.xml index cb4a383edfa..cb8b6a995a5 100644 --- a/administrator/components/com_finder/config.xml +++ b/administrator/components/com_finder/config.xml @@ -269,29 +269,6 @@ default="0.3" /> - - - - - - - - - - - tokenise($input); + $terms = array_filter($terms); /* * If we have to handle the input as a phrase, that means we don't @@ -158,14 +77,14 @@ class FinderIndexerHelper if ($phrase === true && count($terms) > 1) { // Create tokens from the phrase. - $tokens[] = new FinderIndexerToken($terms, $lang); + $tokens[] = new FinderIndexerToken($terms, $language->language, $language->spacer); } else { // Create tokens from the terms. 
for ($i = 0, $n = count($terms); $i < $n; $i++) { - $tokens[] = new FinderIndexerToken($terms[$i], $lang); + $tokens[] = new FinderIndexerToken($terms[$i], $language->language); } // Create two and three word phrase tokens from the individual words. @@ -179,7 +98,7 @@ class FinderIndexerHelper if ($i2 < $n && isset($tokens[$i2])) { // Tokenize the two word phrase. - $token = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term), $lang, $lang === 'zh' ? '' : ' '); + $token = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term), $language->language, $language->spacer); $token->derived = true; // Add the token to the stack. @@ -190,7 +109,7 @@ class FinderIndexerHelper if ($i3 < $n && isset($tokens[$i3])) { // Tokenize the three word phrase. - $token = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term, $tokens[$i3]->term), $lang, $lang === 'zh' ? '' : ' '); + $token = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term, $tokens[$i3]->term), $language->language, $language->spacer); $token->derived = true; // Add the token to the stack. @@ -199,22 +118,13 @@ class FinderIndexerHelper } } - if ($store) - { - $cache[$store] = count($tokens) > 1 ? $tokens : array_shift($tokens); + $cache[$store] = $tokens; - return $cache[$store]; - } - else - { - return count($tokens) > 1 ? $tokens : array_shift($tokens); - } + return $cache[$store]; } /** - * Method to get the base word of a token. This method uses the public - * {@link FinderIndexerHelper::$stemmer} object if it is set. If no stemmer is set, - * the original token is returned. + * Method to get the base word of a token. * * @param string $token The token to stem. * @param string $lang The language of the token. @@ -225,31 +135,9 @@ class FinderIndexerHelper */ public static function stem($token, $lang) { - // Trim apostrophes at either end of the token. 
- $token = trim($token, '\''); + $language = FinderIndexerLanguage::getInstance($lang); - // Trim everything after any apostrophe in the token. - if ($res = explode('\'', $token)) - { - $token = $res[0]; - } - - if (static::$stemmerOK === true) - { - return static::$stemmer->stem($token, $lang); - } - else - { - // Stem the token if we have a valid stemmer to use. - if (static::$stemmer instanceof FinderIndexerStemmer) - { - static::$stemmerOK = true; - - return static::$stemmer->stem($token, $lang); - } - } - - return $token; + return $language->stem($token); } /** diff --git a/administrator/components/com_finder/helpers/indexer/indexer.php b/administrator/components/com_finder/helpers/indexer/indexer.php index b6d38108bac..7181f3b3581 100644 --- a/administrator/components/com_finder/helpers/indexer/indexer.php +++ b/administrator/components/com_finder/helpers/indexer/indexer.php @@ -12,8 +12,8 @@ defined('_JEXEC') or die; use Joomla\String\StringHelper; JLoader::register('FinderIndexerHelper', __DIR__ . '/helper.php'); +JLoader::register('FinderIndexerLanguage', __DIR__ . '/language.php'); JLoader::register('FinderIndexerParser', __DIR__ . '/parser.php'); -JLoader::register('FinderIndexerStemmer', __DIR__ . '/stemmer.php'); JLoader::register('FinderIndexerTaxonomy', __DIR__ . '/taxonomy.php'); JLoader::register('FinderIndexerToken', __DIR__ . '/token.php'); @@ -213,12 +213,6 @@ abstract class FinderIndexer static::$profiler = JProfiler::getInstance('FinderIndexer'); } - // Setup the stemmer. - if ($data->options->get('stem', 1) && $data->options->get('stemmer', 'porter_en')) - { - FinderIndexerHelper::$stemmer = FinderIndexerStemmer::getInstance($data->options->get('stemmer', 'porter_en')); - } - // Set the state. static::$state = $data; @@ -471,6 +465,11 @@ abstract class FinderIndexer // Tokenize the input. $tokens = FinderIndexerHelper::tokenize($input, $lang); + if (count($tokens) == 0) + { + return $count; + } + // Add the tokens to the database. 
$count += $this->addTokensToDb($tokens, $context); diff --git a/administrator/components/com_finder/helpers/indexer/language.php b/administrator/components/com_finder/helpers/indexer/language.php new file mode 100644 index 00000000000..bd7055ca9d7 --- /dev/null +++ b/administrator/components/com_finder/helpers/indexer/language.php @@ -0,0 +1,146 @@ +language = $locale; + } + + return $instances[$language]; + } + + /** + * Method to tokenise a text string: the input is lower-cased, stripped of punctuation as listed below and split on single spaces. + * + * @param string $input The input to tokenise. + * + * @return array An array of term strings. + * + * @since __DEPLOY_VERSION__ + */ + public function tokenise($input) + { + $quotes = html_entity_decode('&#8216;&#8217;&#39;', ENT_QUOTES, 'UTF-8'); + + /* + * Parsing the string input into terms is a multi-step process. + * + * Regexes: + * 1. Remove everything except letters, numbers, quotes, apostrophe, plus, dash, period, and comma. + * 2. Remove plus, dash, period, and comma characters located before letter characters (the letters themselves are kept). + * 3. Remove plus, dash, period, and comma characters located after other characters. + * 4. Remove plus, period, and comma characters enclosed in alphabetical characters. Ungreedy. + * 5. Remove orphaned apostrophe, plus, dash, period, and comma characters. + * 6. Remove orphaned quote characters. + * 7. Replace the assorted single quotation marks with the ASCII standard single quotation. + * 8. Remove multiple space characters and replaces with a single space. + */ + $input = StringHelper::strtolower($input); + $input = preg_replace('#[^\pL\pM\pN\p{Pi}\p{Pf}\'+-.,]+#mui', ' ', $input); + $input = preg_replace('#(^|\s)[+-.,]+([\pL\pM]+)#mui', ' $2', $input); + $input = preg_replace('#([\pL\pM\pN]+)[+-.,]+(\s|$)#mui', '$1 ', $input); + $input = preg_replace('#([\pL\pM]+)[+.,]+([\pL\pM]+)#muiU', '$1 $2', $input); + $input = preg_replace('#(^|\s)[\'+-.,]+(\s|$)#mui', ' ', $input); + $input = preg_replace('#(^|\s)[\p{Pi}\p{Pf}]+(\s|$)#mui', ' ', $input); + $input = preg_replace('#[' . $quotes .
']+#mui', '\'', $input); + $input = preg_replace('#\s+#mui', ' ', $input); + $input = trim($input); + + // Explode the normalized string to get the terms. + $terms = explode(' ', $input); + + return $terms; + } + + /** + * Method to stem a token. + * + * @param string $token The token to stem. + * + * @return string The stemmed token. + * + * @since __DEPLOY_VERSION__ + */ + public function stem($token) + { + return $token; + } +} diff --git a/administrator/components/com_finder/helpers/indexer/language/da.php b/administrator/components/com_finder/helpers/indexer/language/da.php new file mode 100644 index 00000000000..8a9630af9dd --- /dev/null +++ b/administrator/components/com_finder/helpers/indexer/language/da.php @@ -0,0 +1,58 @@ +stemmer = new \Wamania\Snowball\Danish; + } + + /** + * Method to stem a token. + * + * @param string $token The token to stem. + * + * @return string The stemmed token. + * + * @since __DEPLOY_VERSION__ + */ + public function stem($token) + { + return $this->stemmer->stem($token); + } +} diff --git a/administrator/components/com_finder/helpers/indexer/language/de.php b/administrator/components/com_finder/helpers/indexer/language/de.php new file mode 100644 index 00000000000..cd5e97e609a --- /dev/null +++ b/administrator/components/com_finder/helpers/indexer/language/de.php @@ -0,0 +1,58 @@ +stemmer = new \Wamania\Snowball\German; + } + + /** + * Method to stem a token. + * + * @param string $token The token to stem. + * + * @return string The stemmed token. + * + * @since __DEPLOY_VERSION__ + */ + public function stem($token) + { + return $this->stemmer->stem($token); + } +} diff --git a/administrator/components/com_finder/helpers/indexer/language/el.php b/administrator/components/com_finder/helpers/indexer/language/el.php new file mode 100644 index 00000000000..49e5b477eea --- /dev/null +++ b/administrator/components/com_finder/helpers/indexer/language/el.php @@ -0,0 +1,1015 @@ +. 
This is + * derivative work, based on the Greek stemmer for Drupal, see + * https://github.com/magaras/greek_stemmer/blob/master/mod_stemmer.php + */ + +defined('_JEXEC') or die; + +/** + * Greek language support class for the Finder indexer package. + * + * @since __DEPLOY_VERSION__ + */ +class FinderIndexerLanguageel extends FinderIndexerLanguage +{ + /** + * Language locale of the class + * + * @var string + * @since __DEPLOY_VERSION__ + */ + public $language = 'el'; + + /** + * Method to tokenise a text string. It takes into account the odd punctuation commonly used in Greek text, mapping + * it to ASCII punctuation. + * + * Reference: http://www.teicrete.gr/users/kutrulis/Glosika/Stixi.htm + * + * @param string $input The input to tokenise. + * + * @return array An array of term strings. + * + * @since __DEPLOY_VERSION__ + */ + public function tokenise($input) + { + // Replace Greek calligraphic double quotes (various styles) to dumb double quotes + $input = str_replace(['“', '”', '„', '«' ,'»'], '"', $input); + + // Replace Greek calligraphic single quotes (various styles) to dumb single quotes + $input = str_replace(['‘','’','‚'], "'", $input); + + // Replace the middle dot (ano teleia) with a comma, adequate for the purpose of stemming + $input = str_replace('·', ',', $input); + + // Dot and dash (τελεία και παύλα), used to denote the end of a context at the end of a paragraph. + $input = str_replace('.–', '.', $input); + + // Ellipsis, two styles (separate dots or single glyph) + $input = str_replace(['...', '…'], '.', $input); + + // Cross. Marks the death date of a person. Removed. + $input = str_replace('†', '', $input); + + // Star. Reference, supposition word (in philology), birth date of a person. + $input = str_replace('*', '', $input); + + // Paragraph. Indicates change of subject. + $input = str_replace('§', '.', $input); + + // Plus/minus. Shows approximation. Not relevant for the stemmer, hence its conversion to a space. 
+ $input = str_replace('±', ' ', $input); + + return parent::tokenise($input); + } + + /** + * Method to stem a token. + * + * @param string $token The token to stem. + * + * @return string The stemmed token. + * + * @since __DEPLOY_VERSION__ + */ + public function stem($token) + { + $token = $this->toUpperCase($token, $w_CASE); + + // Stop-word removal + $stop_words = '/^(ΕΚΟ|ΑΒΑ|ΑΓΑ|ΑΓΗ|ΑΓΩ|ΑΔΗ|ΑΔΩ|ΑΕ|ΑΕΙ|ΑΘΩ|ΑΙ|ΑΙΚ|ΑΚΗ|ΑΚΟΜΑ|ΑΚΟΜΗ|ΑΚΡΙΒΩΣ|ΑΛΑ|ΑΛΗΘΕΙΑ|ΑΛΗΘΙΝΑ|ΑΛΛΑΧΟΥ|ΑΛΛΙΩΣ|ΑΛΛΙΩΤΙΚΑ|' + . 'ΑΛΛΟΙΩΣ|ΑΛΛΟΙΩΤΙΚΑ|ΑΛΛΟΤΕ|ΑΛΤ|ΑΛΩ|ΑΜΑ|ΑΜΕ|ΑΜΕΣΑ|ΑΜΕΣΩΣ|ΑΜΩ|ΑΝ|ΑΝΑ|ΑΝΑΜΕΣΑ|ΑΝΑΜΕΤΑΞΥ|ΑΝΕΥ|ΑΝΤΙ|ΑΝΤΙΠΕΡΑ|ΑΝΤΙΣ|ΑΝΩ|ΑΝΩΤΕΡΩ|ΑΞΑΦΝΑ|' + . 'ΑΠ|ΑΠΕΝΑΝΤΙ|ΑΠΟ|ΑΠΟΨΕ|ΑΠΩ|ΑΡΑ|ΑΡΑΓΕ|ΑΡΕ|ΑΡΚ|ΑΡΚΕΤΑ|ΑΡΛ|ΑΡΜ|ΑΡΤ|ΑΡΥ|ΑΡΩ|ΑΣ|ΑΣΑ|ΑΣΟ|ΑΤΑ|ΑΤΕ|ΑΤΗ|ΑΤΙ|ΑΤΜ|ΑΤΟ|ΑΥΡΙΟ|ΑΦΗ|ΑΦΟΤΟΥ|ΑΦΟΥ|' + . 'ΑΧ|ΑΧΕ|ΑΧΟ|ΑΨΑ|ΑΨΕ|ΑΨΗ|ΑΨΥ|ΑΩΕ|ΑΩΟ|ΒΑΝ|ΒΑΤ|ΒΑΧ|ΒΕΑ|ΒΕΒΑΙΟΤΑΤΑ|ΒΗΞ|ΒΙΑ|ΒΙΕ|ΒΙΗ|ΒΙΟ|ΒΟΗ|ΒΟΩ|ΒΡΕ|ΓΑ|ΓΑΒ|ΓΑΡ|ΓΕΝ|ΓΕΣ||ΓΗ|ΓΗΝ|ΓΙ|ΓΙΑ|' + . 'ΓΙΕ|ΓΙΝ|ΓΙΟ|ΓΚΙ|ΓΙΑΤΙ|ΓΚΥ|ΓΟΗ|ΓΟΟ|ΓΡΗΓΟΡΑ|ΓΡΙ|ΓΡΥ|ΓΥΗ|ΓΥΡΩ|ΔΑ|ΔΕ|ΔΕΗ|ΔΕΙ|ΔΕΝ|ΔΕΣ|ΔΗ|ΔΗΘΕΝ|ΔΗΛΑΔΗ|ΔΗΩ|ΔΙ|ΔΙΑ|ΔΙΑΡΚΩΣ|ΔΙΟΛΟΥ|ΔΙΣ|' + . 'ΔΙΧΩΣ|ΔΟΛ|ΔΟΝ|ΔΡΑ|ΔΡΥ|ΔΡΧ|ΔΥΕ|ΔΥΟ|ΔΩ|ΕΑΜ|ΕΑΝ|ΕΑΡ|ΕΘΗ|ΕΙ|ΕΙΔΕΜΗ|ΕΙΘΕ|ΕΙΜΑΙ|ΕΙΜΑΣΤΕ|ΕΙΝΑΙ|ΕΙΣ|ΕΙΣΑΙ|ΕΙΣΑΣΤΕ|ΕΙΣΤΕ|ΕΙΤΕ|ΕΙΧΑ|ΕΙΧΑΜΕ|' + . 'ΕΙΧΑΝ|ΕΙΧΑΤΕ|ΕΙΧΕ|ΕΙΧΕΣ|ΕΚ|ΕΚΕΙ|ΕΛΑ|ΕΛΙ|ΕΜΠ|ΕΝ|ΕΝΤΕΛΩΣ|ΕΝΤΟΣ|ΕΝΤΩΜΕΤΑΞΥ|ΕΝΩ|ΕΞ|ΕΞΑΦΝΑ|ΕΞΙ|ΕΞΙΣΟΥ|ΕΞΩ|ΕΟΚ|ΕΠΑΝΩ|ΕΠΕΙΔΗ|ΕΠΕΙΤΑ|ΕΠΗ|' + . 'ΕΠΙ|ΕΠΙΣΗΣ|ΕΠΟΜΕΝΩΣ|ΕΡΑ|ΕΣ|ΕΣΑΣ|ΕΣΕ|ΕΣΕΙΣ|ΕΣΕΝΑ|ΕΣΗ|ΕΣΤΩ|ΕΣΥ|ΕΣΩ|ΕΤΙ|ΕΤΣΙ|ΕΥ|ΕΥΑ|ΕΥΓΕ|ΕΥΘΥΣ|ΕΥΤΥΧΩΣ|ΕΦΕ|ΕΦΕΞΗΣ|ΕΦΤ|ΕΧΕ|ΕΧΕΙ|' + . 'ΕΧΕΙΣ|ΕΧΕΤΕ|ΕΧΘΕΣ|ΕΧΟΜΕ|ΕΧΟΥΜΕ|ΕΧΟΥΝ|ΕΧΤΕΣ|ΕΧΩ|ΕΩΣ|ΖΕΑ|ΖΕΗ|ΖΕΙ|ΖΕΝ|ΖΗΝ|ΖΩ|Η|ΗΔΗ|ΗΔΥ|ΗΘΗ|ΗΛΟ|ΗΜΙ|ΗΠΑ|ΗΣΑΣΤΕ|ΗΣΟΥΝ|ΗΤΑ|ΗΤΑΝ|ΗΤΑΝΕ|' + . 'ΗΤΟΙ|ΗΤΤΟΝ|ΗΩ|ΘΑ|ΘΥΕ|ΘΩΡ|Ι|ΙΑ|ΙΒΟ|ΙΔΗ|ΙΔΙΩΣ|ΙΕ|ΙΙ|ΙΙΙ|ΙΚΑ|ΙΛΟ|ΙΜΑ|ΙΝΑ|ΙΝΩ|ΙΞΕ|ΙΞΟ|ΙΟ|ΙΟΙ|ΙΣΑ|ΙΣΑΜΕ|ΙΣΕ|ΙΣΗ|ΙΣΙΑ|ΙΣΟ|ΙΣΩΣ|ΙΩΒ|ΙΩΝ|' + . 'ΙΩΣ|ΙΑΝ|ΚΑΘ|ΚΑΘΕ|ΚΑΘΕΤΙ|ΚΑΘΟΛΟΥ|ΚΑΘΩΣ|ΚΑΙ|ΚΑΝ|ΚΑΠΟΤΕ|ΚΑΠΟΥ|ΚΑΠΩΣ|ΚΑΤ|ΚΑΤΑ|ΚΑΤΙ|ΚΑΤΙΤΙ|ΚΑΤΟΠΙΝ|ΚΑΤΩ|ΚΑΩ|ΚΒΟ|ΚΕΑ|ΚΕΙ|ΚΕΝ|ΚΙ|ΚΙΜ|' + . 'ΚΙΟΛΑΣ|ΚΙΤ|ΚΙΧ|ΚΚΕ|ΚΛΙΣΕ|ΚΛΠ|ΚΟΚ|ΚΟΝΤΑ|ΚΟΧ|ΚΤΛ|ΚΥΡ|ΚΥΡΙΩΣ|ΚΩ|ΚΩΝ|ΛΑ|ΛΕΑ|ΛΕΝ|ΛΕΟ|ΛΙΑ|ΛΙΓΑΚΙ|ΛΙΓΟΥΛΑΚΙ|ΛΙΓΟ|ΛΙΓΩΤΕΡΟ|ΛΙΟ|ΛΙΡ|ΛΟΓΩ|' + . 
'ΛΟΙΠΑ|ΛΟΙΠΟΝ|ΛΟΣ|ΛΣ|ΛΥΩ|ΜΑ|ΜΑΖΙ|ΜΑΚΑΡΙ|ΜΑΛΙΣΤΑ|ΜΑΛΛΟΝ|ΜΑΝ|ΜΑΞ|ΜΑΣ|ΜΑΤ|ΜΕ|ΜΕΘΑΥΡΙΟ|ΜΕΙ|ΜΕΙΟΝ|ΜΕΛ|ΜΕΛΕΙ|ΜΕΛΛΕΤΑΙ|ΜΕΜΙΑΣ|ΜΕΝ|ΜΕΣ|' + . 'ΜΕΣΑ|ΜΕΤ|ΜΕΤΑ|ΜΕΤΑΞΥ|ΜΕΧΡΙ|ΜΗ|ΜΗΔΕ|ΜΗΝ|ΜΗΠΩΣ|ΜΗΤΕ|ΜΙ|ΜΙΞ|ΜΙΣ|ΜΜΕ|ΜΝΑ|ΜΟΒ|ΜΟΛΙΣ|ΜΟΛΟΝΟΤΙ|ΜΟΝΑΧΑ|ΜΟΝΟΜΙΑΣ|ΜΙΑ|ΜΟΥ|ΜΠΑ|ΜΠΟΡΕΙ|' + . 'ΜΠΟΡΟΥΝ|ΜΠΡΑΒΟ|ΜΠΡΟΣ|ΜΠΩ|ΜΥ|ΜΥΑ|ΜΥΝ|ΝΑ|ΝΑΕ|ΝΑΙ|ΝΑΟ|ΝΔ|ΝΕΐ|ΝΕΑ|ΝΕΕ|ΝΕΟ|ΝΙ|ΝΙΑ|ΝΙΚ|ΝΙΛ|ΝΙΝ|ΝΙΟ|ΝΤΑ|ΝΤΕ|ΝΤΙ|ΝΤΟ|ΝΥΝ|ΝΩΕ|ΝΩΡΙΣ|ΞΑΝΑ|' + . 'ΞΑΦΝΙΚΑ|ΞΕΩ|ΞΙ|Ο|ΟΑ|ΟΑΠ|ΟΔΟ|ΟΕ|ΟΖΟ|ΟΗΕ|ΟΙ|ΟΙΑ|ΟΙΗ|ΟΚΑ|ΟΛΟΓΥΡΑ|ΟΛΟΝΕΝ|ΟΛΟΤΕΛΑ|ΟΛΩΣΔΙΟΛΟΥ|ΟΜΩΣ|ΟΝ|ΟΝΕ|ΟΝΟ|ΟΠΑ|ΟΠΕ|ΟΠΗ|ΟΠΟ|' + . 'ΟΠΟΙΑΔΗΠΟΤΕ|ΟΠΟΙΑΝΔΗΠΟΤΕ|ΟΠΟΙΑΣΔΗΠΟΤΕ|ΟΠΟΙΔΗΠΟΤΕ|ΟΠΟΙΕΣΔΗΠΟΤΕ|ΟΠΟΙΟΔΗΠΟΤΕ|ΟΠΟΙΟΝΔΗΠΟΤΕ|ΟΠΟΙΟΣΔΗΠΟΤΕ|ΟΠΟΙΟΥΔΗΠΟΤΕ|ΟΠΟΙΟΥΣΔΗΠΟΤΕ|' + . 'ΟΠΟΙΩΝΔΗΠΟΤΕ|ΟΠΟΤΕΔΗΠΟΤΕ|ΟΠΟΥ|ΟΠΟΥΔΗΠΟΤΕ|ΟΠΩΣ|ΟΡΑ|ΟΡΕ|ΟΡΗ|ΟΡΟ|ΟΡΦ|ΟΡΩ|ΟΣΑ|ΟΣΑΔΗΠΟΤΕ|ΟΣΕ|ΟΣΕΣΔΗΠΟΤΕ|ΟΣΗΔΗΠΟΤΕ|ΟΣΗΝΔΗΠΟΤΕ|' + . 'ΟΣΗΣΔΗΠΟΤΕ|ΟΣΟΔΗΠΟΤΕ|ΟΣΟΙΔΗΠΟΤΕ|ΟΣΟΝΔΗΠΟΤΕ|ΟΣΟΣΔΗΠΟΤΕ|ΟΣΟΥΔΗΠΟΤΕ|ΟΣΟΥΣΔΗΠΟΤΕ|ΟΣΩΝΔΗΠΟΤΕ|ΟΤΑΝ|ΟΤΕ|ΟΤΙ|ΟΤΙΔΗΠΟΤΕ|ΟΥ|ΟΥΔΕ|ΟΥΚ|ΟΥΣ|' + . 'ΟΥΤΕ|ΟΥΦ|ΟΧΙ|ΟΨΑ|ΟΨΕ|ΟΨΗ|ΟΨΙ|ΟΨΟ|ΠΑ|ΠΑΛΙ|ΠΑΝ|ΠΑΝΤΟΤΕ|ΠΑΝΤΟΥ|ΠΑΝΤΩΣ|ΠΑΠ|ΠΑΡ|ΠΑΡΑ|ΠΕΙ|ΠΕΡ|ΠΕΡΑ|ΠΕΡΙ|ΠΕΡΙΠΟΥ|ΠΕΡΣΙ|ΠΕΡΥΣΙ|ΠΕΣ|ΠΙ|' + . 'ΠΙΑ|ΠΙΘΑΝΟΝ|ΠΙΚ|ΠΙΟ|ΠΙΣΩ|ΠΙΤ|ΠΙΩ|ΠΛΑΙ|ΠΛΕΟΝ|ΠΛΗΝ|ΠΛΩ|ΠΜ|ΠΟΑ|ΠΟΕ|ΠΟΛ|ΠΟΛΥ|ΠΟΠ|ΠΟΤΕ|ΠΟΥ|ΠΟΥΘΕ|ΠΟΥΘΕΝΑ|ΠΡΕΠΕΙ|ΠΡΙ|ΠΡΙΝ|ΠΡΟ|' + . 'ΠΡΟΚΕΙΜΕΝΟΥ|ΠΡΟΚΕΙΤΑΙ|ΠΡΟΠΕΡΣΙ|ΠΡΟΣ|ΠΡΟΤΟΥ|ΠΡΟΧΘΕΣ|ΠΡΟΧΤΕΣ|ΠΡΩΤΥΤΕΡΑ|ΠΥΑ|ΠΥΞ|ΠΥΟ|ΠΥΡ|ΠΧ|ΠΩ|ΠΩΛ|ΠΩΣ|ΡΑ|ΡΑΙ|ΡΑΠ|ΡΑΣ|ΡΕ|ΡΕΑ|ΡΕΕ|ΡΕΙ|' + . 'ΡΗΣ|ΡΘΩ|ΡΙΟ|ΡΟ|ΡΟΐ|ΡΟΕ|ΡΟΖ|ΡΟΗ|ΡΟΘ|ΡΟΙ|ΡΟΚ|ΡΟΛ|ΡΟΝ|ΡΟΣ|ΡΟΥ|ΣΑΙ|ΣΑΝ|ΣΑΟ|ΣΑΣ|ΣΕ|ΣΕΙΣ|ΣΕΚ|ΣΕΞ|ΣΕΡ|ΣΕΤ|ΣΕΦ|ΣΗΜΕΡΑ|ΣΙ|ΣΙΑ|ΣΙΓΑ|ΣΙΚ|' + . 'ΣΙΧ|ΣΚΙ|ΣΟΙ|ΣΟΚ|ΣΟΛ|ΣΟΝ|ΣΟΣ|ΣΟΥ|ΣΡΙ|ΣΤΑ|ΣΤΗ|ΣΤΗΝ|ΣΤΗΣ|ΣΤΙΣ|ΣΤΟ|ΣΤΟΝ|ΣΤΟΥ|ΣΤΟΥΣ|ΣΤΩΝ|ΣΥ|ΣΥΓΧΡΟΝΩΣ|ΣΥΝ|ΣΥΝΑΜΑ|ΣΥΝΕΠΩΣ|ΣΥΝΗΘΩΣ|' + . 'ΣΧΕΔΟΝ|ΣΩΣΤΑ|ΤΑ|ΤΑΔΕ|ΤΑΚ|ΤΑΝ|ΤΑΟ|ΤΑΥ|ΤΑΧΑ|ΤΑΧΑΤΕ|ΤΕ|ΤΕΙ|ΤΕΛ|ΤΕΛΙΚΑ|ΤΕΛΙΚΩΣ|ΤΕΣ|ΤΕΤ|ΤΖΟ|ΤΗ|ΤΗΛ|ΤΗΝ|ΤΗΣ|ΤΙ|ΤΙΚ|ΤΙΜ|ΤΙΠΟΤΑ|ΤΙΠΟΤΕ|' + . 'ΤΙΣ|ΤΝΤ|ΤΟ|ΤΟΙ|ΤΟΚ|ΤΟΜ|ΤΟΝ|ΤΟΠ|ΤΟΣ|ΤΟΣ?Ν|ΤΟΣΑ|ΤΟΣΕΣ|ΤΟΣΗ|ΤΟΣΗΝ|ΤΟΣΗΣ|ΤΟΣΟ|ΤΟΣΟΙ|ΤΟΣΟΝ|ΤΟΣΟΣ|ΤΟΣΟΥ|ΤΟΣΟΥΣ|ΤΟΤΕ|ΤΟΥ|ΤΟΥΛΑΧΙΣΤΟ|' + . 'ΤΟΥΛΑΧΙΣΤΟΝ|ΤΟΥΣ|ΤΣ|ΤΣΑ|ΤΣΕ|ΤΥΧΟΝ|ΤΩ|ΤΩΝ|ΤΩΡΑ|ΥΑΣ|ΥΒΑ|ΥΒΟ|ΥΙΕ|ΥΙΟ|ΥΛΑ|ΥΛΗ|ΥΝΙ|ΥΠ|ΥΠΕΡ|ΥΠΟ|ΥΠΟΨΗ|ΥΠΟΨΙΝ|ΥΣΤΕΡΑ|ΥΦΗ|ΥΨΗ|ΦΑ|ΦΑΐ|ΦΑΕ|' + . 
'ΦΑΝ|ΦΑΞ|ΦΑΣ|ΦΑΩ|ΦΕΖ|ΦΕΙ|ΦΕΤΟΣ|ΦΕΥ|ΦΙ|ΦΙΛ|ΦΙΣ|ΦΟΞ|ΦΠΑ|ΦΡΙ|ΧΑ|ΧΑΗ|ΧΑΛ|ΧΑΝ|ΧΑΦ|ΧΕ|ΧΕΙ|ΧΘΕΣ|ΧΙ|ΧΙΑ|ΧΙΛ|ΧΙΟ|ΧΛΜ|ΧΜ|ΧΟΗ|ΧΟΛ|ΧΡΩ|ΧΤΕΣ|' + . 'ΧΩΡΙΣ|ΧΩΡΙΣΤΑ|ΨΕΣ|ΨΗΛΑ|ΨΙ|ΨΙΤ|Ω|ΩΑ|ΩΑΣ|ΩΔΕ|ΩΕΣ|ΩΘΩ|ΩΜΑ|ΩΜΕ|ΩΝ|ΩΟ|ΩΟΝ|ΩΟΥ|ΩΣ|ΩΣΑΝ|ΩΣΗ|ΩΣΟΤΟΥ|ΩΣΠΟΥ|ΩΣΤΕ|ΩΣΤΟΣΟ|ΩΤΑ|ΩΧ|ΩΩΝ)$/'; + + if (preg_match($stop_words, $token)) + { + return $this->toLowerCase($token, $w_CASE); + } + + // Vowels + $v = '(Α|Ε|Η|Ι|Ο|Υ|Ω)'; + + // Vowels without Υ (upsilon) + $v2 = '(Α|Ε|Η|Ι|Ο|Ω)'; + + $test1 = true; + + // Step S1. 14 stems; exception stems get Ι / ΙΖ re-appended (Greek capitals, as in step S3, never Latin look-alikes) + $re = '/^(.+?)(ΙΖΑ|ΙΖΕΣ|ΙΖΕ|ΙΖΑΜΕ|ΙΖΑΤΕ|ΙΖΑΝ|ΙΖΑΝΕ|ΙΖΩ|ΙΖΕΙΣ|ΙΖΕΙ|ΙΖΟΥΜΕ|ΙΖΕΤΕ|ΙΖΟΥΝ|ΙΖΟΥΝΕ)$/'; + $exceptS1 = '/^(ΑΝΑΜΠΑ|ΕΜΠΑ|ΕΠΑ|ΞΑΝΑΠΑ|ΠΑ|ΠΕΡΙΠΑ|ΑΘΡΟ|ΣΥΝΑΘΡΟ|ΔΑΝΕ)$/'; + $exceptS2 = '/^(ΜΑΡΚ|ΚΟΡΝ|ΑΜΠΑΡ|ΑΡΡ|ΒΑΘΥΡΙ|ΒΑΡΚ|Β|ΒΟΛΒΟΡ|ΓΚΡ|ΓΛΥΚΟΡ|ΓΛΥΚΥΡ|ΙΜΠ|Λ|ΛΟΥ|ΜΑΡ|Μ|ΠΡ|ΜΠΡ|ΠΟΛΥΡ|Π|Ρ|ΠΙΠΕΡΟΡ)$/'; + + if (preg_match($re, $token, $match)) + { + $token = $match[1]; + + if (preg_match($exceptS1, $token)) + { + $token = $token . 'Ι'; + } + + if (preg_match($exceptS2, $token)) + { + $token = $token . 'ΙΖ'; + } + + return $this->toLowerCase($token, $w_CASE); + } + + // Step S2. 7 stems + $re = '/^(.+?)(ΩΘΗΚΑ|ΩΘΗΚΕΣ|ΩΘΗΚΕ|ΩΘΗΚΑΜΕ|ΩΘΗΚΑΤΕ|ΩΘΗΚΑΝ|ΩΘΗΚΑΝΕ)$/'; + $exceptS1 = '/^(ΑΛ|ΒΙ|ΕΝ|ΥΨ|ΛΙ|ΖΩ|Σ|Χ)$/'; + + if (preg_match($re, $token, $match)) + { + $token = $match[1]; + + if (preg_match($exceptS1, $token)) + { + $token = $token . 'ΩΝ'; + } + + return $this->toLowerCase($token, $w_CASE); + } + + // Step S3. 7 stems; every exit restores the original case through toLowerCase() + $re = '/^(.+?)(ΙΣΑ|ΙΣΕΣ|ΙΣΕ|ΙΣΑΜΕ|ΙΣΑΤΕ|ΙΣΑΝ|ΙΣΑΝΕ)$/'; + $exceptS1 = '/^(ΑΝΑΜΠΑ|ΑΘΡΟ|ΕΜΠΑ|ΕΣΕ|ΕΣΩΚΛΕ|ΕΠΑ|ΞΑΝΑΠΑ|ΕΠΕ|ΠΕΡΙΠΑ|ΑΘΡΟ|ΣΥΝΑΘΡΟ|ΔΑΝΕ|ΚΛΕ|ΧΑΡΤΟΠΑ|ΕΞΑΡΧΑ|ΜΕΤΕΠΕ|ΑΠΟΚΛΕ|ΑΠΕΚΛΕ|ΕΚΛΕ|ΠΕ|ΠΕΡΙΠΑ)$/'; + $exceptS2 = '/^(ΑΝ|ΑΦ|ΓΕ|ΓΙΓΑΝΤΟΑΦ|ΓΚΕ|ΔΗΜΟΚΡΑΤ|ΚΟΜ|ΓΚ|Μ|Π|ΠΟΥΚΑΜ|ΟΛΟ|ΛΑΡ)$/'; + + if ($token == "ΙΣΑ") + { + $token = "ΙΣ"; + + return $this->toLowerCase($token, $w_CASE); + } + + if (preg_match($re, $token, $match)) + { + $token = $match[1]; + + if (preg_match($exceptS1, $token)) + { + $token = $token . 'Ι'; + } + + if (preg_match($exceptS2, $token)) + { + $token = $token .
'ΙΣ'; + } + + return $this->toLowerCase($token, $w_CASE); + } + + + // Step S4. 7 stems + $re = '/^(.+?)(ΙΣΩ|ΙΣΕΙΣ|ΙΣΕΙ|ΙΣΟΥΜΕ|ΙΣΕΤΕ|ΙΣΟΥΝ|ΙΣΟΥΝΕ)$/'; + $exceptS1 = '/^(ΑΝΑΜΠΑ|ΕΜΠΑ|ΕΣΕ|ΕΣΩΚΛΕ|ΕΠΑ|ΞΑΝΑΠΑ|ΕΠΕ|ΠΕΡΙΠΑ|ΑΘΡΟ|ΣΥΝΑΘΡΟ|ΔΑΝΕ|ΚΛΕ|ΧΑΡΤΟΠΑ|ΕΞΑΡΧΑ|ΜΕΤΕΠΕ|ΑΠΟΚΛΕ|ΑΠΕΚΛΕ|ΕΚΛΕ|ΠΕ|ΠΕΡΙΠΑ)$/'; + + if (preg_match($re, $token, $match)) + { + $token = $match[1]; + + if (preg_match($exceptS1, $token)) + { + $token = $token . 'Ι'; + } + + return $this->toLowerCase($token, $w_CASE); + } + + // Step S5. 11 stems + $re = '/^(.+?)(ΙΣΤΟΣ|ΙΣΤΟΥ|ΙΣΤΟ|ΙΣΤΕ|ΙΣΤΟΙ|ΙΣΤΩΝ|ΙΣΤΟΥΣ|ΙΣΤΗ|ΙΣΤΗΣ|ΙΣΤΑ|ΙΣΤΕΣ)$/'; + $exceptS1 = '/^(Μ|Π|ΑΠ|ΑΡ|ΗΔ|ΚΤ|ΣΚ|ΣΧ|ΥΨ|ΦΑ|ΧΡ|ΧΤ|ΑΚΤ|ΑΟΡ|ΑΣΧ|ΑΤΑ|ΑΧΝ|ΑΧΤ|ΓΕΜ|ΓΥΡ|ΕΜΠ|ΕΥΠ|ΕΧΘ|ΗΦΑ|ΚΑΘ|ΚΑΚ|ΚΥΛ|ΛΥΓ|ΜΑΚ|ΜΕΓ|ΤΑΧ|ΦΙΛ|ΧΩΡ)$/'; + $exceptS2 = '/^(ΔΑΝΕ|ΣΥΝΑΘΡΟ|ΚΛΕ|ΣΕ|ΕΣΩΚΛΕ|ΑΣΕ|ΠΛΕ)$/'; + + if (preg_match($re, $token, $match)) + { + $token = $match[1]; + + if (preg_match($exceptS1, $token)) + { + $token = $token . 'ΙΣΤ'; + } + + if (preg_match($exceptS2, $token)) + { + $token = $token . 'Ι'; + } + + return $this->toLowerCase($token, $w_CASE); + } + + // Step S6. 6 stems + $re = '/^(.+?)(ΙΣΜΟ|ΙΣΜΟΙ|ΙΣΜΟΣ|ΙΣΜΟΥ|ΙΣΜΟΥΣ|ΙΣΜΩΝ)$/'; + $exceptS1 = '/^(ΑΓΝΩΣΤΙΚ|ΑΤΟΜΙΚ|ΓΝΩΣΤΙΚ|ΕΘΝΙΚ|ΕΚΛΕΚΤΙΚ|ΣΚΕΠΤΙΚ|ΤΟΠΙΚ)$/'; + $exceptS2 = '/^(ΣΕ|ΜΕΤΑΣΕ|ΜΙΚΡΟΣΕ|ΕΓΚΛΕ|ΑΠΟΚΛΕ)$/'; + $exceptS3 = '/^(ΔΑΝΕ|ΑΝΤΙΔΑΝΕ)$/'; + $exceptS4 = '/^(ΑΛΕΞΑΝΔΡΙΝ|ΒΥΖΑΝΤΙΝ|ΘΕΑΤΡΙΝ)$/'; + + if (preg_match($re, $token, $match)) + { + $token = $match[1]; + + if (preg_match($exceptS1, $token)) + { + $token = str_replace('ΙΚ', "", $token); + } + + if (preg_match($exceptS2, $token)) + { + $token = $token . "ΙΣΜ"; + } + + if (preg_match($exceptS3, $token)) + { + $token = $token . "Ι"; + } + + if (preg_match($exceptS4, $token)) + { + $token = str_replace('ΙΝ', "", $token); + } + + return $this->toLowerCase($token, $w_CASE); + } + + // Step S7. 
4 stems; the Σ and Χ stems get ΑΡΑΚ re-appended (Greek capitals only, no Latin look-alike letters) + $re = '/^(.+?)(ΑΡΑΚΙ|ΑΡΑΚΙΑ|ΟΥΔΑΚΙ|ΟΥΔΑΚΙΑ)$/'; + $exceptS1 = '/^(Σ|Χ)$/'; + + if (preg_match($re, $token, $match)) + { + $token = $match[1]; + + if (preg_match($exceptS1, $token)) + { + $token = $token . "ΑΡΑΚ"; + } + + return $this->toLowerCase($token, $w_CASE); + } + + + // Step S8. 8 stems + $re = '/^(.+?)(ΑΚΙ|ΑΚΙΑ|ΙΤΣΑ|ΙΤΣΑΣ|ΙΤΣΕΣ|ΙΤΣΩΝ|ΑΡΑΚΙ|ΑΡΑΚΙΑ)$/'; + $exceptS1 = '/^(ΑΝΘΡ|ΒΑΜΒ|ΒΡ|ΚΑΙΜ|ΚΟΝ|ΚΟΡ|ΛΑΒΡ|ΛΟΥΛ|ΜΕΡ|ΜΟΥΣΤ|ΝΑΓΚΑΣ|ΠΛ|Ρ|ΡΥ|Σ|ΣΚ|ΣΟΚ|ΣΠΑΝ|ΤΖ|ΦΑΡΜ|Χ|' + . 'ΚΑΠΑΚ|ΑΛΙΣΦ|ΑΜΒΡ|ΑΝΘΡ|Κ|ΦΥΛ|ΚΑΤΡΑΠ|ΚΛΙΜ|ΜΑΛ|ΣΛΟΒ|Φ|ΣΦ|ΤΣΕΧΟΣΛΟΒ)$/'; + $exceptS2 = '/^(Β|ΒΑΛ|ΓΙΑΝ|ΓΛ|Ζ|ΗΓΟΥΜΕΝ|ΚΑΡΔ|ΚΟΝ|ΜΑΚΡΥΝ|ΝΥΦ|ΠΑΤΕΡ|Π|ΣΚ|ΤΟΣ|ΤΡΙΠΟΛ)$/'; + + // For words like ΠΛΟΥΣΙΟΚΟΡΙΤΣΑ, ΠΑΛΙΟΚΟΡΙΤΣΑ etc + $exceptS3 = '/(ΚΟΡ)$/'; + + if (preg_match($re, $token, $match)) + { + $token = $match[1]; + + if (preg_match($exceptS1, $token)) + { + $token = $token . "ΑΚ"; + } + + if (preg_match($exceptS2, $token)) + { + $token = $token . "ΙΤΣ"; + } + + if (preg_match($exceptS3, $token)) + { + $token = $token . "ΙΤΣ"; + } + + return $this->toLowerCase($token, $w_CASE); + } + + // Step S9. 3 stems + $re = '/^(.+?)(ΙΔΙΟ|ΙΔΙΑ|ΙΔΙΩΝ)$/'; + $exceptS1 = '/^(ΑΙΦΝ|ΙΡ|ΟΛΟ|ΨΑΛ)$/'; + $exceptS2 = '/(Ε|ΠΑΙΧΝ)$/'; + + if (preg_match($re, $token, $match)) + { + $token = $match[1]; + + if (preg_match($exceptS1, $token)) + { + $token = $token . "ΙΔ"; + } + + if (preg_match($exceptS2, $token)) + { + $token = $token . "ΙΔ"; + } + + return $this->toLowerCase($token, $w_CASE); + } + + // Step S10. 4 stems + $re = '/^(.+?)(ΙΣΚΟΣ|ΙΣΚΟΥ|ΙΣΚΟ|ΙΣΚΕ)$/'; + $exceptS1 = '/^(Δ|ΙΒ|ΜΗΝ|Ρ|ΦΡΑΓΚ|ΛΥΚ|ΟΒΕΛ)$/'; + + if (preg_match($re, $token, $match)) + { + $token = $match[1]; + + if (preg_match($exceptS1, $token)) + { + $token = $token . "ΙΣΚ"; + } + + return $this->toLowerCase($token, $w_CASE); + } + + // Step 1 + // step1list is used in Step 1.
41 stems + $step1list = Array(); + $step1list["ΦΑΓΙΑ"] = "ΦΑ"; + $step1list["ΦΑΓΙΟΥ"] = "ΦΑ"; + $step1list["ΦΑΓΙΩΝ"] = "ΦΑ"; + $step1list["ΣΚΑΓΙΑ"] = "ΣΚΑ"; + $step1list["ΣΚΑΓΙΟΥ"] = "ΣΚΑ"; + $step1list["ΣΚΑΓΙΩΝ"] = "ΣΚΑ"; + $step1list["ΟΛΟΓΙΟΥ"] = "ΟΛΟ"; + $step1list["ΟΛΟΓΙΑ"] = "ΟΛΟ"; + $step1list["ΟΛΟΓΙΩΝ"] = "ΟΛΟ"; + $step1list["ΣΟΓΙΟΥ"] = "ΣΟ"; + $step1list["ΣΟΓΙΑ"] = "ΣΟ"; + $step1list["ΣΟΓΙΩΝ"] = "ΣΟ"; + $step1list["ΤΑΤΟΓΙΑ"] = "ΤΑΤΟ"; + $step1list["ΤΑΤΟΓΙΟΥ"] = "ΤΑΤΟ"; + $step1list["ΤΑΤΟΓΙΩΝ"] = "ΤΑΤΟ"; + $step1list["ΚΡΕΑΣ"] = "ΚΡΕ"; + $step1list["ΚΡΕΑΤΟΣ"] = "ΚΡΕ"; + $step1list["ΚΡΕΑΤΑ"] = "ΚΡΕ"; + $step1list["ΚΡΕΑΤΩΝ"] = "ΚΡΕ"; + $step1list["ΠΕΡΑΣ"] = "ΠΕΡ"; + $step1list["ΠΕΡΑΤΟΣ"] = "ΠΕΡ"; + + // Added by Spyros. Also at $re in step1 + $step1list["ΠΕΡΑΤΗ"] = "ΠΕΡ"; + $step1list["ΠΕΡΑΤΑ"] = "ΠΕΡ"; + $step1list["ΠΕΡΑΤΩΝ"] = "ΠΕΡ"; + $step1list["ΤΕΡΑΣ"] = "ΤΕΡ"; + $step1list["ΤΕΡΑΤΟΣ"] = "ΤΕΡ"; + $step1list["ΤΕΡΑΤΑ"] = "ΤΕΡ"; + $step1list["ΤΕΡΑΤΩΝ"] = "ΤΕΡ"; + $step1list["ΦΩΣ"] = "ΦΩ"; + $step1list["ΦΩΤΟΣ"] = "ΦΩ"; + $step1list["ΦΩΤΑ"] = "ΦΩ"; + $step1list["ΦΩΤΩΝ"] = "ΦΩ"; + $step1list["ΚΑΘΕΣΤΩΣ"] = "ΚΑΘΕΣΤ"; + $step1list["ΚΑΘΕΣΤΩΤΟΣ"] = "ΚΑΘΕΣΤ"; + $step1list["ΚΑΘΕΣΤΩΤΑ"] = "ΚΑΘΕΣΤ"; + $step1list["ΚΑΘΕΣΤΩΤΩΝ"] = "ΚΑΘΕΣΤ"; + $step1list["ΓΕΓΟΝΟΣ"] = "ΓΕΓΟΝ"; + $step1list["ΓΕΓΟΝΟΤΟΣ"] = "ΓΕΓΟΝ"; + $step1list["ΓΕΓΟΝΟΤΑ"] = "ΓΕΓΟΝ"; + $step1list["ΓΕΓΟΝΟΤΩΝ"] = "ΓΕΓΟΝ"; + + $re = '/(.*)(ΦΑΓΙΑ|ΦΑΓΙΟΥ|ΦΑΓΙΩΝ|ΣΚΑΓΙΑ|ΣΚΑΓΙΟΥ|ΣΚΑΓΙΩΝ|ΟΛΟΓΙΟΥ|ΟΛΟΓΙΑ|ΟΛΟΓΙΩΝ|ΣΟΓΙΟΥ|ΣΟΓΙΑ|ΣΟΓΙΩΝ|ΤΑΤΟΓΙΑ|ΤΑΤΟΓΙΟΥ|ΤΑΤΟΓΙΩΝ|ΚΡΕΑΣ|ΚΡΕΑΤΟΣ|' + . 'ΚΡΕΑΤΑ|ΚΡΕΑΤΩΝ|ΠΕΡΑΣ|ΠΕΡΑΤΟΣ|ΠΕΡΑΤΗ|ΠΕΡΑΤΑ|ΠΕΡΑΤΩΝ|ΤΕΡΑΣ|ΤΕΡΑΤΟΣ|ΤΕΡΑΤΑ|ΤΕΡΑΤΩΝ|ΦΩΣ|ΦΩΤΟΣ|ΦΩΤΑ|ΦΩΤΩΝ|ΚΑΘΕΣΤΩΣ|ΚΑΘΕΣΤΩΤΟΣ|' + . 'ΚΑΘΕΣΤΩΤΑ|ΚΑΘΕΣΤΩΤΩΝ|ΓΕΓΟΝΟΣ|ΓΕΓΟΝΟΤΟΣ|ΓΕΓΟΝΟΤΑ|ΓΕΓΟΝΟΤΩΝ)$/'; + + if (preg_match($re, $token, $match)) + { + $stem = $match[1]; + $suffix = $match[2]; + $token = $stem . (array_key_exists($suffix, $step1list) ? $step1list[$suffix] : ''); + $test1 = false; + } + + // Step 2a. 
2 stems + $re = '/^(.+?)(ΑΔΕΣ|ΑΔΩΝ)$/'; + + if (preg_match($re, $token, $match)) + { + $token = $match[1]; + $re = '/(ΟΚ|ΜΑΜ|ΜΑΝ|ΜΠΑΜΠ|ΠΑΤΕΡ|ΓΙΑΓΙ|ΝΤΑΝΤ|ΚΥΡ|ΘΕΙ|ΠΕΘΕΡ)$/'; + + if (!preg_match($re, $token)) + { + $token = $token . "ΑΔ"; + } + } + + // Step 2b. 2 stems + $re = '/^(.+?)(ΕΔΕΣ|ΕΔΩΝ)$/'; + + if (preg_match($re, $token)) + { + preg_match($re, $token, $match); + $token = $match[1]; + $exept2 = '/(ΟΠ|ΙΠ|ΕΜΠ|ΥΠ|ΓΗΠ|ΔΑΠ|ΚΡΑΣΠ|ΜΙΛ)$/'; + + if (preg_match($exept2, $token)) + { + $token = $token . 'ΕΔ'; + } + } + + // Step 2c + $re = '/^(.+?)(ΟΥΔΕΣ|ΟΥΔΩΝ)$/'; + + if (preg_match($re, $token)) + { + preg_match($re, $token, $match); + $token = $match[1]; + + $exept3 = '/(ΑΡΚ|ΚΑΛΙΑΚ|ΠΕΤΑΛ|ΛΙΧ|ΠΛΕΞ|ΣΚ|Σ|ΦΛ|ΦΡ|ΒΕΛ|ΛΟΥΛ|ΧΝ|ΣΠ|ΤΡΑΓ|ΦΕ)$/'; + + if (preg_match($exept3, $token)) + { + $token = $token . 'ΟΥΔ'; + } + } + + // Step 2d + $re = '/^(.+?)(ΕΩΣ|ΕΩΝ)$/'; + + if (preg_match($re, $token)) + { + preg_match($re, $token, $match); + $token = $match[1]; + $test1 = false; + $exept4 = '/^(Θ|Δ|ΕΛ|ΓΑΛ|Ν|Π|ΙΔ|ΠΑΡ)$/'; + + if (preg_match($exept4, $token)) + { + $token = $token . 'Ε'; + } + + } + + // Step 3 + $re = '/^(.+?)(ΙΑ|ΙΟΥ|ΙΩΝ)$/'; + + if (preg_match($re, $token, $fp)) + { + $stem = $fp[1]; + $token = $stem; + $re = '/' . $v . '$/'; + $test1 = false; + + if (preg_match($re, $token)) + { + $token = $stem . 'Ι'; + } + } + + // Step 4 + $re = '/^(.+?)(ΙΚΑ|ΙΚΟ|ΙΚΟΥ|ΙΚΩΝ)$/'; + + if (preg_match($re, $token)) + { + preg_match($re, $token, $match); + $token = $match[1]; + $test1 = false; + $re = '/' . $v . '$/'; + $exept5 = '/^(ΑΛ|ΑΔ|ΕΝΔ|ΑΜΑΝ|ΑΜΜΟΧΑΛ|ΗΘ|ΑΝΗΘ|ΑΝΤΙΔ|ΦΥΣ|ΒΡΩΜ|ΓΕΡ|ΕΞΩΔ|ΚΑΛΠ|ΚΑΛΛΙΝ|ΚΑΤΑΔ|ΜΟΥΛ|ΜΠΑΝ|ΜΠΑΓΙΑΤ|ΜΠΟΛ|ΜΠΟΣ|ΝΙΤ|ΞΙΚ|ΣΥΝΟΜΗΛ|ΠΕΤΣ|' + . 'ΠΙΤΣ|ΠΙΚΑΝΤ|ΠΛΙΑΤΣ|ΠΟΣΤΕΛΝ|ΠΡΩΤΟΔ|ΣΕΡΤ|ΣΥΝΑΔ|ΤΣΑΜ|ΥΠΟΔ|ΦΙΛΟΝ|ΦΥΛΟΔ|ΧΑΣ)$/'; + + if (preg_match($re, $token) || preg_match($exept5, $token)) + { + $token = $token . 
'ΙΚ'; + } + } + + // Step 5a + $re = '/^(.+?)(ΑΜΕ)$/'; + $re2 = '/^(.+?)(ΑΓΑΜΕ|ΗΣΑΜΕ|ΟΥΣΑΜΕ|ΗΚΑΜΕ|ΗΘΗΚΑΜΕ)$/'; + + if ($token == "ΑΓΑΜΕ") + { + $token = "ΑΓΑΜ"; + + } + + if (preg_match($re2, $token)) + { + preg_match($re2, $token, $match); + $token = $match[1]; + $test1 = false; + } + + if (preg_match($re, $token)) + { + preg_match($re, $token, $match); + $token = $match[1]; + $test1 = false; + $exept6 = '/^(ΑΝΑΠ|ΑΠΟΘ|ΑΠΟΚ|ΑΠΟΣΤ|ΒΟΥΒ|ΞΕΘ|ΟΥΛ|ΠΕΘ|ΠΙΚΡ|ΠΟΤ|ΣΙΧ|Χ)$/'; + + if (preg_match($exept6, $token)) + { + $token = $token . "ΑΜ"; + } + } + + // Step 5b + $re2 = '/^(.+?)(ΑΝΕ)$/'; + $re3 = '/^(.+?)(ΑΓΑΝΕ|ΗΣΑΝΕ|ΟΥΣΑΝΕ|ΙΟΝΤΑΝΕ|ΙΟΤΑΝΕ|ΙΟΥΝΤΑΝΕ|ΟΝΤΑΝΕ|ΟΤΑΝΕ|ΟΥΝΤΑΝΕ|ΗΚΑΝΕ|ΗΘΗΚΑΝΕ)$/'; + + if (preg_match($re3, $token)) + { + preg_match($re3, $token, $match); + $token = $match[1]; + $test1 = false; + $re3 = '/^(ΤΡ|ΤΣ)$/'; + + if (preg_match($re3, $token)) + { + $token = $token . "ΑΓΑΝ"; + } + } + + if (preg_match($re2, $token)) + { + preg_match($re2, $token, $match); + $token = $match[1]; + $test1 = false; + $re2 = '/' . $v2 . '$/'; + $exept7 = '/^(ΒΕΤΕΡ|ΒΟΥΛΚ|ΒΡΑΧΜ|Γ|ΔΡΑΔΟΥΜ|Θ|ΚΑΛΠΟΥΖ|ΚΑΣΤΕΛ|ΚΟΡΜΟΡ|ΛΑΟΠΛ|ΜΩΑΜΕΘ|Μ|ΜΟΥΣΟΥΛΜ|Ν|ΟΥΛ|Π|ΠΕΛΕΚ|ΠΛ|ΠΟΛΙΣ|ΠΟΡΤΟΛ|ΣΑΡΑΚΑΤΣ|ΣΟΥΛΤ|' + . 'ΤΣΑΡΛΑΤ|ΟΡΦ|ΤΣΙΓΓ|ΤΣΟΠ|ΦΩΤΟΣΤΕΦ|Χ|ΨΥΧΟΠΛ|ΑΓ|ΟΡΦ|ΓΑΛ|ΓΕΡ|ΔΕΚ|ΔΙΠΛ|ΑΜΕΡΙΚΑΝ|ΟΥΡ|ΠΙΘ|ΠΟΥΡΙΤ|Σ|ΖΩΝΤ|ΙΚ|ΚΑΣΤ|ΚΟΠ|ΛΙΧ|ΛΟΥΘΗΡ|ΜΑΙΝΤ|' + . 'ΜΕΛ|ΣΙΓ|ΣΠ|ΣΤΕΓ|ΤΡΑΓ|ΤΣΑΓ|Φ|ΕΡ|ΑΔΑΠ|ΑΘΙΓΓ|ΑΜΗΧ|ΑΝΙΚ|ΑΝΟΡΓ|ΑΠΗΓ|ΑΠΙΘ|ΑΤΣΙΓΓ|ΒΑΣ|ΒΑΣΚ|ΒΑΘΥΓΑΛ|ΒΙΟΜΗΧ|ΒΡΑΧΥΚ|ΔΙΑΤ|ΔΙΑΦ|ΕΝΟΡΓ|' + . 'ΘΥΣ|ΚΑΠΝΟΒΙΟΜΗΧ|ΚΑΤΑΓΑΛ|ΚΛΙΒ|ΚΟΙΛΑΡΦ|ΛΙΒ|ΜΕΓΛΟΒΙΟΜΗΧ|ΜΙΚΡΟΒΙΟΜΗΧ|ΝΤΑΒ|ΞΗΡΟΚΛΙΒ|ΟΛΙΓΟΔΑΜ|ΟΛΟΓΑΛ|ΠΕΝΤΑΡΦ|ΠΕΡΗΦ|ΠΕΡΙΤΡ|ΠΛΑΤ|' + . 'ΠΟΛΥΔΑΠ|ΠΟΛΥΜΗΧ|ΣΤΕΦ|ΤΑΒ|ΤΕΤ|ΥΠΕΡΗΦ|ΥΠΟΚΟΠ|ΧΑΜΗΛΟΔΑΠ|ΨΗΛΟΤΑΒ)$/'; + + + if (preg_match($re2, $token) || preg_match($exept7, $token)) + { + $token = $token . 
"ΑΝ"; + } + } + + // Step 5c + $re3 = '/^(.+?)(ΕΤΕ)$/'; + $re4 = '/^(.+?)(ΗΣΕΤΕ)$/'; + + if (preg_match($re4, $token)) + { + preg_match($re4, $token, $match); + $token = $match[1]; + $test1 = false; + } + + if (preg_match($re3, $token)) + { + preg_match($re3, $token, $match); + $token = $match[1]; + $test1 = false; + $re3 = '/' . $v2 . '$/'; + $exept8 = '/(ΟΔ|ΑΙΡ|ΦΟΡ|ΤΑΘ|ΔΙΑΘ|ΣΧ|ΕΝΔ|ΕΥΡ|ΤΙΘ|ΥΠΕΡΘ|ΡΑΘ|ΕΝΘ|ΡΟΘ|ΣΘ|ΠΥΡ|ΑΙΝ|ΣΥΝΔ|ΣΥΝ|ΣΥΝΘ|ΧΩΡ|ΠΟΝ|ΒΡ|ΚΑΘ|ΕΥΘ|ΕΚΘ|ΝΕΤ|ΡΟΝ|ΑΡΚ|ΒΑΡ|ΒΟΛ|ΩΦΕΛ)$/'; + $exept9 = '/^(ΑΒΑΡ|ΒΕΝ|ΕΝΑΡ|ΑΒΡ|ΑΔ|ΑΘ|ΑΝ|ΑΠΛ|ΒΑΡΟΝ|ΝΤΡ|ΣΚ|ΚΟΠ|ΜΠΟΡ|ΝΙΦ|ΠΑΓ|ΠΑΡΑΚΑΛ|ΣΕΡΠ|ΣΚΕΛ|ΣΥΡΦ|ΤΟΚ|Υ|Δ|ΕΜ|ΘΑΡΡ|Θ)$/'; + + if (preg_match($re3, $token) || preg_match($exept8, $token) || preg_match($exept9, $token)) + { + $token = $token . "ΕΤ"; + } + } + + // Step 5d + $re = '/^(.+?)(ΟΝΤΑΣ|ΩΝΤΑΣ)$/'; + + if (preg_match($re, $token)) + { + preg_match($re, $token, $match); + $token = $match[1]; + $test1 = false; + $exept10 = '/^(ΑΡΧ)$/'; + $exept11 = '/(ΚΡΕ)$/'; + + if (preg_match($exept10, $token)) + { + $token = $token . "ΟΝΤ"; + } + + if (preg_match($exept11, $token)) + { + $token = $token . "ΩΝΤ"; + } + } + + // Step 5e + $re = '/^(.+?)(ΟΜΑΣΤΕ|ΙΟΜΑΣΤΕ)$/'; + + if (preg_match($re, $token)) + { + preg_match($re, $token, $match); + $token = $match[1]; + $test1 = false; + $exept11 = '/^(ΟΝ)$/'; + + if (preg_match($exept11, $token)) + { + $token = $token . "ΟΜΑΣΤ"; + } + } + + // Step 5f + $re = '/^(.+?)(ΕΣΤΕ)$/'; + $re2 = '/^(.+?)(ΙΕΣΤΕ)$/'; + + if (preg_match($re2, $token)) + { + preg_match($re2, $token, $match); + $token = $match[1]; + $test1 = false; + $re2 = '/^(Π|ΑΠ|ΣΥΜΠ|ΑΣΥΜΠ|ΑΚΑΤΑΠ|ΑΜΕΤΑΜΦ)$/'; + + if (preg_match($re2, $token)) + { + $token = $token . "ΙΕΣΤ"; + } + } + + if (preg_match($re, $token)) + { + preg_match($re, $token, $match); + $token = $match[1]; + $test1 = false; + $exept12 = '/^(ΑΛ|ΑΡ|ΕΚΤΕΛ|Ζ|Μ|Ξ|ΠΑΡΑΚΑΛ|ΑΡ|ΠΡΟ|ΝΙΣ)$/'; + + if (preg_match($exept12, $token)) + { + $token = $token . 
"ΕΣΤ"; + } + } + + // Step 5g + $re = '/^(.+?)(ΗΚΑ|ΗΚΕΣ|ΗΚΕ)$/'; + $re2 = '/^(.+?)(ΗΘΗΚΑ|ΗΘΗΚΕΣ|ΗΘΗΚΕ)$/'; + + if (preg_match($re2, $token)) + { + preg_match($re2, $token, $match); + $token = $match[1]; + $test1 = false; + } + + if (preg_match($re, $token)) + { + preg_match($re, $token, $match); + $token = $match[1]; + $test1 = false; + $exept13 = '/(ΣΚΩΛ|ΣΚΟΥΛ|ΝΑΡΘ|ΣΦ|ΟΘ|ΠΙΘ)$/'; + $exept14 = '/^(ΔΙΑΘ|Θ|ΠΑΡΑΚΑΤΑΘ|ΠΡΟΣΘ|ΣΥΝΘ|)$/'; + + if (preg_match($exept13, $token) || preg_match($exept14, $token)) + { + $token = $token . "ΗΚ"; + } + } + + // Step 5h + $re = '/^(.+?)(ΟΥΣΑ|ΟΥΣΕΣ|ΟΥΣΕ)$/'; + + if (preg_match($re, $token)) + { + preg_match($re, $token, $match); + $token = $match[1]; + $test1 = false; + $exept15 = '/^(ΦΑΡΜΑΚ|ΧΑΔ|ΑΓΚ|ΑΝΑΡΡ|ΒΡΟΜ|ΕΚΛΙΠ|ΛΑΜΠΙΔ|ΛΕΧ|Μ|ΠΑΤ|Ρ|Λ|ΜΕΔ|ΜΕΣΑΖ|ΥΠΟΤΕΙΝ|ΑΜ|ΑΙΘ|ΑΝΗΚ|ΔΕΣΠΟΖ|ΕΝΔΙΑΦΕΡ|ΔΕ|ΔΕΥΤΕΡΕΥ|ΚΑΘΑΡΕΥ|ΠΛΕ|ΤΣΑ)$/'; + $exept16 = '/(ΠΟΔΑΡ|ΒΛΕΠ|ΠΑΝΤΑΧ|ΦΡΥΔ|ΜΑΝΤΙΛ|ΜΑΛΛ|ΚΥΜΑΤ|ΛΑΧ|ΛΗΓ|ΦΑΓ|ΟΜ|ΠΡΩΤ)$/'; + + if (preg_match($exept15, $token) || preg_match($exept16, $token)) + { + $token = $token . "ΟΥΣ"; + } + } + + // Step 5i + $re = '/^(.+?)(ΑΓΑ|ΑΓΕΣ|ΑΓΕ)$/'; + + if (preg_match($re, $token)) + { + preg_match($re, $token, $match); + $token = $match[1]; + $test1 = false; + $exept17 = '/^(ΨΟΦ|ΝΑΥΛΟΧ)$/'; + $exept20 = '/(ΚΟΛΛ)$/'; + $exept18 = '/^(ΑΒΑΣΤ|ΠΟΛΥΦ|ΑΔΗΦ|ΠΑΜΦ|Ρ|ΑΣΠ|ΑΦ|ΑΜΑΛ|ΑΜΑΛΛΙ|ΑΝΥΣΤ|ΑΠΕΡ|ΑΣΠΑΡ|ΑΧΑΡ|ΔΕΡΒΕΝ|ΔΡΟΣΟΠ|ΞΕΦ|ΝΕΟΠ|ΝΟΜΟΤ|ΟΛΟΠ|ΟΜΟΤ|ΠΡΟΣΤ|ΠΡΟΣΩΠΟΠ|' + . 'ΣΥΜΠ|ΣΥΝΤ|Τ|ΥΠΟΤ|ΧΑΡ|ΑΕΙΠ|ΑΙΜΟΣΤ|ΑΝΥΠ|ΑΠΟΤ|ΑΡΤΙΠ|ΔΙΑΤ|ΕΝ|ΕΠΙΤ|ΚΡΟΚΑΛΟΠ|ΣΙΔΗΡΟΠ|Λ|ΝΑΥ|ΟΥΛΑΜ|ΟΥΡ|Π|ΤΡ|Μ)$/'; + $exept19 = '/(ΟΦ|ΠΕΛ|ΧΟΡΤ|ΛΛ|ΣΦ|ΡΠ|ΦΡ|ΠΡ|ΛΟΧ|ΣΜΗΝ)$/'; + + if ((preg_match($exept18, $token) || preg_match($exept19, $token)) + && !(preg_match($exept17, $token) || preg_match($exept20, $token))) + { + $token = $token . 
"ΑΓ"; + } + } + + + // Step 5j + $re = '/^(.+?)(ΗΣΕ|ΗΣΟΥ|ΗΣΑ)$/'; + + if (preg_match($re, $token)) + { + preg_match($re, $token, $match); + $token = $match[1]; + $test1 = false; + $exept21 = '/^(Ν|ΧΕΡΣΟΝ|ΔΩΔΕΚΑΝ|ΕΡΗΜΟΝ|ΜΕΓΑΛΟΝ|ΕΠΤΑΝ)$/'; + + if (preg_match($exept21, $token)) + { + $token = $token . "ΗΣ"; + } + } + + // Step 5k + $re = '/^(.+?)(ΗΣΤΕ)$/'; + + if (preg_match($re, $token)) + { + preg_match($re, $token, $match); + $token = $match[1]; + $test1 = false; + $exept22 = '/^(ΑΣΒ|ΣΒ|ΑΧΡ|ΧΡ|ΑΠΛ|ΑΕΙΜΝ|ΔΥΣΧΡ|ΕΥΧΡ|ΚΟΙΝΟΧΡ|ΠΑΛΙΜΨ)$/'; + + if (preg_match($exept22, $token)) + { + $token = $token . "ΗΣΤ"; + } + } + + // Step 5l + $re = '/^(.+?)(ΟΥΝΕ|ΗΣΟΥΝΕ|ΗΘΟΥΝΕ)$/'; + + if (preg_match($re, $token)) + { + preg_match($re, $token, $match); + $token = $match[1]; + $test1 = false; + $exept23 = '/^(Ν|Ρ|ΣΠΙ|ΣΤΡΑΒΟΜΟΥΤΣ|ΚΑΚΟΜΟΥΤΣ|ΕΞΩΝ)$/'; + + if (preg_match($exept23, $token)) + { + $token = $token . "ΟΥΝ"; + } + } + + // Step 5m + $re = '/^(.+?)(ΟΥΜΕ|ΗΣΟΥΜΕ|ΗΘΟΥΜΕ)$/'; + + if (preg_match($re, $token)) + { + preg_match($re, $token, $match); + $token = $match[1]; + $test1 = false; + $exept24 = '/^(ΠΑΡΑΣΟΥΣ|Φ|Χ|ΩΡΙΟΠΛ|ΑΖ|ΑΛΛΟΣΟΥΣ|ΑΣΟΥΣ)$/'; + + if (preg_match($exept24, $token)) + { + $token = $token . "ΟΥΜ"; + } + } + + // Step 6 + $re = '/^(.+?)(ΜΑΤΑ|ΜΑΤΩΝ|ΜΑΤΟΣ)$/'; + $re2 = '/^(.+?)(Α|ΑΓΑΤΕ|ΑΓΑΝ|ΑΕΙ|ΑΜΑΙ|ΑΝ|ΑΣ|ΑΣΑΙ|ΑΤΑΙ|ΑΩ|Ε|ΕΙ|ΕΙΣ|ΕΙΤΕ|ΕΣΑΙ|ΕΣ|ΕΤΑΙ|Ι|ΙΕΜΑΙ|ΙΕΜΑΣΤΕ|ΙΕΤΑΙ|ΙΕΣΑΙ|ΙΕΣΑΣΤΕ|ΙΟΜΑΣΤΑΝ|ΙΟΜΟΥΝ|' + . 'ΙΟΜΟΥΝΑ|ΙΟΝΤΑΝ|ΙΟΝΤΟΥΣΑΝ|ΙΟΣΑΣΤΑΝ|ΙΟΣΑΣΤΕ|ΙΟΣΟΥΝ|ΙΟΣΟΥΝΑ|ΙΟΤΑΝ|ΙΟΥΜΑ|ΙΟΥΜΑΣΤΕ|ΙΟΥΝΤΑΙ|ΙΟΥΝΤΑΝ|Η|ΗΔΕΣ|ΗΔΩΝ|ΗΘΕΙ|ΗΘΕΙΣ|ΗΘΕΙΤΕ|' + . 'ΗΘΗΚΑΤΕ|ΗΘΗΚΑΝ|ΗΘΟΥΝ|ΗΘΩ|ΗΚΑΤΕ|ΗΚΑΝ|ΗΣ|ΗΣΑΝ|ΗΣΑΤΕ|ΗΣΕΙ|ΗΣΕΣ|ΗΣΟΥΝ|ΗΣΩ|Ο|ΟΙ|ΟΜΑΙ|ΟΜΑΣΤΑΝ|ΟΜΟΥΝ|ΟΜΟΥΝΑ|ΟΝΤΑΙ|ΟΝΤΑΝ|ΟΝΤΟΥΣΑΝ|ΟΣ|' + . 'ΟΣΑΣΤΑΝ|ΟΣΑΣΤΕ|ΟΣΟΥΝ|ΟΣΟΥΝΑ|ΟΤΑΝ|ΟΥ|ΟΥΜΑΙ|ΟΥΜΑΣΤΕ|ΟΥΝ|ΟΥΝΤΑΙ|ΟΥΝΤΑΝ|ΟΥΣ|ΟΥΣΑΝ|ΟΥΣΑΤΕ|Υ|ΥΣ|Ω|ΩΝ)$/'; + + if (preg_match($re, $token, $match)) + { + $token = $match[1] . 
/**
 * Converts the token to uppercase, suppressing accents and diaeresis.
 *
 * Every lowercase Greek letter is replaced by its plain (unaccented) capital. Any other
 * character (Latin letters, digits, letters that are already uppercase, ...) is copied
 * through unchanged. While converting, $w_CASE is rebuilt from scratch with one entry per
 * character of $token so that toLowerCase() can undo the conversion later:
 *
 *   0 = character was not converted
 *   1 = plain lowercase letter
 *   2 = vowel with accent (tonos), or final sigma
 *   3 = vowel with diaeresis (dialytika)
 *   4 = vowel with both diaeresis and accent
 *
 * @param   string  $token    Token to process
 * @param   array   &$w_CASE  Map of uppercase rules (overwritten by this method)
 *
 * @return  string  The uppercase, accent-free token
 *
 * @since   __DEPLOY_VERSION__
 */
protected function toUpperCase($token, &$w_CASE)
{
	// Lowercase letter => array(plain uppercase letter, case rule to record).
	$caseConvert = array(
		'α' => array('Α', 1),
		'β' => array('Β', 1),
		'γ' => array('Γ', 1),
		'δ' => array('Δ', 1),
		'ε' => array('Ε', 1),
		'ζ' => array('Ζ', 1),
		'η' => array('Η', 1),
		'θ' => array('Θ', 1),
		'ι' => array('Ι', 1),
		'κ' => array('Κ', 1),
		'λ' => array('Λ', 1),
		'μ' => array('Μ', 1),
		'ν' => array('Ν', 1),
		'ξ' => array('Ξ', 1),
		'ο' => array('Ο', 1),
		'π' => array('Π', 1),
		'ρ' => array('Ρ', 1),
		'σ' => array('Σ', 1),
		'τ' => array('Τ', 1),
		'υ' => array('Υ', 1),
		'φ' => array('Φ', 1),
		'χ' => array('Χ', 1),
		'ψ' => array('Ψ', 1),
		'ω' => array('Ω', 1),

		// Accented vowels and the final sigma.
		'ά' => array('Α', 2),
		'έ' => array('Ε', 2),
		'ή' => array('Η', 2),
		'ί' => array('Ι', 2),
		'ό' => array('Ο', 2),
		'ύ' => array('Υ', 2),
		'ώ' => array('Ω', 2),
		'ς' => array('Σ', 2),

		// Vowels with diaeresis. 'ϋ' is an upsilon, so it has to fold to 'Υ' (it used to fold to 'Ι').
		'ϊ' => array('Ι', 3),
		'ϋ' => array('Υ', 3),

		// Vowels with diaeresis and accent.
		'ΐ' => array('Ι', 4),
		'ΰ' => array('Υ', 4),
	);

	// Use one explicit encoding for both the map size and the character walk, so the two
	// can never disagree when mb_internal_encoding() is not UTF-8.
	$length   = mb_strlen($token, 'UTF-8');
	$w_CASE   = array_fill(0, $length, 0);
	$newToken = '';

	for ($i = 0; $i < $length; $i++)
	{
		$char = mb_substr($token, $i, 1, 'UTF-8');

		// Not a lowercase Greek letter: keep it as is, the rule for this position stays 0.
		if (!isset($caseConvert[$char]))
		{
			$newToken .= $char;

			continue;
		}

		list($upperCase, $rule) = $caseConvert[$char];

		$newToken   .= $upperCase;
		$w_CASE[$i]  = $rule;
	}

	return $newToken;
}
/**
 * Converts the suppressed uppercase token back to lowercase, using the $w_CASE map to add
 * back the accents and diaeresis and to handle the special case of final sigma (which has a
 * different lowercase glyph than the regular sigma).
 *
 * $w_CASE is read position by position. The rules are:
 *
 *   0 (or no entry) = copy the character unchanged
 *   1               = plain lowercase
 *   2               = vowel with accent (tonos), or final sigma
 *   3               = vowel with diaeresis (dialytika)
 *   4               = vowel with both diaeresis and accent
 *
 * Any other rule value leaves the character unchanged.
 *
 * @param   string  $token   Token to process
 * @param   array   $w_CASE  Map of lowercase rules
 *
 * @return  string  The lowercase token
 *
 * @since   __DEPLOY_VERSION__
 */
protected function toLowerCase($token, $w_CASE)
{
	// Case rule => (plain uppercase letter => decorated lowercase letter).
	$restoreMaps = [
		2 => [
			'Α' => 'ά',
			'Ε' => 'έ',
			'Η' => 'ή',
			'Ι' => 'ί',
			'Ο' => 'ό',
			'Υ' => 'ύ',
			'Ω' => 'ώ',
			'Σ' => 'ς',
		],
		3 => [
			'Ι' => 'ϊ',
			'Υ' => 'ϋ',
		],
		4 => [
			'Ι' => 'ΐ',
			'Υ' => 'ΰ',
		],
	];

	$newToken = '';
	$length   = mb_strlen($token, 'UTF-8');

	for ($i = 0; $i < $length; $i++)
	{
		$char = mb_substr($token, $i, 1, 'UTF-8');

		// No rule recorded at this position: we assume no case conversion ever took place.
		$rule = isset($w_CASE[$i]) ? (int) $w_CASE[$i] : 0;

		if ($rule === 1)
		{
			$newToken .= mb_strtolower($char, 'UTF-8');

			continue;
		}

		if (isset($restoreMaps[$rule]))
		{
			/*
			 * Defensive fallback: if the letter at this position is not one the rule can
			 * decorate, emit its plain lowercase form. Previously this raised an
			 * "undefined index" notice and the character was dropped from the result.
			 */
			$newToken .= isset($restoreMaps[$rule][$char])
				? $restoreMaps[$rule][$char]
				: mb_strtolower($char, 'UTF-8');

			continue;
		}

		// Rule 0 or an unknown rule: keep the character untouched.
		$newToken .= $char;
	}

	return $newToken;
}
+ * + * @since __DEPLOY_VERSION__ + */ + public function stem($token) + { + return $this->stemmer->stem($token); + } +} diff --git a/administrator/components/com_finder/helpers/indexer/language/it.php b/administrator/components/com_finder/helpers/indexer/language/it.php new file mode 100644 index 00000000000..44248c74f5c --- /dev/null +++ b/administrator/components/com_finder/helpers/indexer/language/it.php @@ -0,0 +1,58 @@ +stemmer = new \Wamania\Snowball\Italian; + } + + /** + * Method to stem a token. + * + * @param string $token The token to stem. + * + * @return string The stemmed token. + * + * @since __DEPLOY_VERSION__ + */ + public function stem($token) + { + return $this->stemmer->stem($token); + } +} diff --git a/administrator/components/com_finder/helpers/indexer/language/nl.php b/administrator/components/com_finder/helpers/indexer/language/nl.php new file mode 100644 index 00000000000..c57a06a5568 --- /dev/null +++ b/administrator/components/com_finder/helpers/indexer/language/nl.php @@ -0,0 +1,58 @@ +stemmer = new \Wamania\Snowball\Dutch; + } + + /** + * Method to stem a token. + * + * @param string $token The token to stem. + * + * @return string The stemmed token. + * + * @since __DEPLOY_VERSION__ + */ + public function stem($token) + { + return $this->stemmer->stem($token); + } +} diff --git a/administrator/components/com_finder/helpers/indexer/language/nn.php b/administrator/components/com_finder/helpers/indexer/language/nn.php new file mode 100644 index 00000000000..d996f25f56c --- /dev/null +++ b/administrator/components/com_finder/helpers/indexer/language/nn.php @@ -0,0 +1,58 @@ +stemmer = new \Wamania\Snowball\Norwegian; + } + + /** + * Method to stem a token. + * + * @param string $token The token to stem. + * + * @return string The stemmed token. 
+ * + * @since __DEPLOY_VERSION__ + */ + public function stem($token) + { + return $this->stemmer->stem($token); + } +} diff --git a/administrator/components/com_finder/helpers/indexer/language/pt.php b/administrator/components/com_finder/helpers/indexer/language/pt.php new file mode 100644 index 00000000000..5075c1ce612 --- /dev/null +++ b/administrator/components/com_finder/helpers/indexer/language/pt.php @@ -0,0 +1,58 @@ +stemmer = new \Wamania\Snowball\Portuguese; + } + + /** + * Method to stem a token. + * + * @param string $token The token to stem. + * + * @return string The stemmed token. + * + * @since __DEPLOY_VERSION__ + */ + public function stem($token) + { + return $this->stemmer->stem($token); + } +} diff --git a/administrator/components/com_finder/helpers/indexer/language/ro.php b/administrator/components/com_finder/helpers/indexer/language/ro.php new file mode 100644 index 00000000000..954eb2bd8ba --- /dev/null +++ b/administrator/components/com_finder/helpers/indexer/language/ro.php @@ -0,0 +1,58 @@ +stemmer = new \Wamania\Snowball\Romanian; + } + + /** + * Method to stem a token. + * + * @param string $token The token to stem. + * + * @return string The stemmed token. + * + * @since __DEPLOY_VERSION__ + */ + public function stem($token) + { + return $this->stemmer->stem($token); + } +} diff --git a/administrator/components/com_finder/helpers/indexer/language/ru.php b/administrator/components/com_finder/helpers/indexer/language/ru.php new file mode 100644 index 00000000000..87548c236f2 --- /dev/null +++ b/administrator/components/com_finder/helpers/indexer/language/ru.php @@ -0,0 +1,58 @@ +stemmer = new \Wamania\Snowball\Russian; + } + + /** + * Method to stem a token. + * + * @param string $token The token to stem. + * + * @return string The stemmed token. 
+ * + * @since __DEPLOY_VERSION__ + */ + public function stem($token) + { + return $this->stemmer->stem($token); + } +} diff --git a/administrator/components/com_finder/helpers/indexer/language/sv.php b/administrator/components/com_finder/helpers/indexer/language/sv.php new file mode 100644 index 00000000000..6cb9295fbeb --- /dev/null +++ b/administrator/components/com_finder/helpers/indexer/language/sv.php @@ -0,0 +1,58 @@ +stemmer = new \Wamania\Snowball\Swedish; + } + + /** + * Method to stem a token. + * + * @param string $token The token to stem. + * + * @return string The stemmed token. + * + * @since __DEPLOY_VERSION__ + */ + public function stem($token) + { + return $this->stemmer->stem($token); + } +} diff --git a/administrator/components/com_finder/helpers/indexer/language/zh.php b/administrator/components/com_finder/helpers/indexer/language/zh.php new file mode 100644 index 00000000000..c0797e54582 --- /dev/null +++ b/administrator/components/com_finder/helpers/indexer/language/zh.php @@ -0,0 +1,74 @@ +language = !empty($options['language']) ? $options['language'] : FinderIndexerHelper::getDefaultLanguage(); - $this->language = FinderIndexerHelper::getPrimaryLanguage($this->language); // Get the matching mode. $this->mode = 'AND'; @@ -995,7 +1002,7 @@ class FinderIndexerQuery { // Tokenize the current term. $token = FinderIndexerHelper::tokenize($terms[$i], $lang, true); - $token = $this->getTokenData($token); + $token = $this->getTokenData(array_shift($token)); // Set the required flag. $token->required = true; @@ -1009,7 +1016,7 @@ class FinderIndexerQuery // Tokenize the term after the next term (current plus two). $other = FinderIndexerHelper::tokenize($terms[$i + 2], $lang, true); - $other = $this->getTokenData($other); + $other = $this->getTokenData(array_shift($other)); // Set the required flag. $other->required = true; @@ -1147,7 +1154,7 @@ class FinderIndexerQuery // Tokenize the next term (current plus one). 
$other = FinderIndexerHelper::tokenize($terms[$i + 1], $lang, true); - $other = $this->getTokenData($other); + $other = $this->getTokenData(array_shift($other)); // Set the required flag. $other->required = false; @@ -1187,7 +1194,7 @@ class FinderIndexerQuery { // Tokenize the phrase. $token = FinderIndexerHelper::tokenize($phrases[$i], $lang, true); - $token = $this->getTokenData($token); + $token = $this->getTokenData(array_shift($token)); // Set the required flag. $token->required = true; diff --git a/administrator/components/com_finder/helpers/indexer/stemmer.php b/administrator/components/com_finder/helpers/indexer/stemmer.php deleted file mode 100644 index 15ecc7cbb55..00000000000 --- a/administrator/components/com_finder/helpers/indexer/stemmer.php +++ /dev/null @@ -1,83 +0,0 @@ -clean($adapter, 'cmd'); - $path = __DIR__ . '/stemmer/' . $adapter . '.php'; - $class = 'FinderIndexerStemmer' . ucfirst($adapter); - - // Check if a stemmer exists for the adapter. - if (!file_exists($path)) - { - // Throw invalid adapter exception. - throw new Exception(JText::sprintf('COM_FINDER_INDEXER_INVALID_STEMMER', $adapter)); - } - - // Instantiate the stemmer. - JLoader::register($class, $path); - $instances[$adapter] = new $class; - - return $instances[$adapter]; - } - - /** - * Method to stem a token and return the root. - * - * @param string $token The token to stem. - * @param string $lang The language of the token. - * - * @return string The root token. - * - * @since 2.5 - */ - abstract public function stem($token, $lang); -} diff --git a/administrator/components/com_finder/helpers/indexer/stemmer/fr.php b/administrator/components/com_finder/helpers/indexer/stemmer/fr.php deleted file mode 100644 index 8aa080efc5f..00000000000 --- a/administrator/components/com_finder/helpers/indexer/stemmer/fr.php +++ /dev/null @@ -1,265 +0,0 @@ -cache[$lang][$token])) - { - // Stem the token. - $result = self::getStem($token); - - // Add the token to the cache. 
- $this->cache[$lang][$token] = $result; - } - - return $this->cache[$lang][$token]; - } - - /** - * French stemmer rules variables. - * - * @return array The rules - * - * @since 3.0 - */ - protected static function getStemRules() - { - if (self::$stemRules) - { - return self::$stemRules; - } - - $vars = array(); - - // French accented letters in ISO-8859-1 encoding - $vars['accents'] = chr(224) . chr(226) . chr(232) . chr(233) . chr(234) . chr(235) . chr(238) . chr(239) - . chr(244) . chr(251) . chr(249) . chr(231); - - // The rule patterns include all accented words for french language - $vars['rule_pattern'] = '/^([a-z' . $vars['accents'] . ']*)(\*){0,1}(\d)([a-z' . $vars['accents'] . ']*)([.|>])/'; - - // French vowels (including y) in ISO-8859-1 encoding - $vars['vowels'] = chr(97) . chr(224) . chr(226) . chr(101) . chr(232) . chr(233) . chr(234) . chr(235) - . chr(105) . chr(238) . chr(239) . chr(111) . chr(244) . chr(117) . chr(251) . chr(249) . chr(121); - - // The French rules in ISO-8859-1 encoding - $vars['rules'] = array( - 'esre1>', 'esio1>', 'siol1.', 'siof0.', 'sioe0.', 'sio3>', 'st1>', 'sf1>', 'sle1>', 'slo1>', 's' . chr(233) . '1>', chr(233) . 'tuae5.', - chr(233) . 'tuae2.', 'tnia0.', 'tniv1.', 'tni3>', 'suor1.', 'suo0.', 'sdrail5.', 'sdrai4.', 'er' . chr(232) . 'i1>', 'sesue3x>', - 'esuey5i.', 'esue2x>', 'se1>', 'er' . chr(232) . 'g3.', 'eca1>', 'esiah0.', 'esi1>', 'siss2.', 'sir2>', 'sit2>', 'egan' . chr(233) . '1.', - 'egalli6>', 'egass1.', 'egas0.', 'egat3.', 'ega3>', 'ette4>', 'ett2>', 'etio1.', 'tio' . chr(231) . '4c.', 'tio0.', 'et1>', 'eb1>', - 'snia1>', 'eniatnau8>', 'eniatn4.', 'enia1>', 'niatnio3.', 'niatg3.', 'e' . chr(233) . '1>', chr(233) . 'hcat1.', chr(233) . 'hca4.', - chr(233) . 'tila5>', chr(233) . 'tici5.', chr(233) . 'tir1.', chr(233) . 'ti3>', chr(233) . 'gan1.', chr(233) . 'ga3>', - chr(233) . 'tehc1.', chr(233) . 'te3>', chr(233) . 'it0.', chr(233) . 
'1>', 'eire4.', 'eirue5.', 'eio1.', 'eia1.', 'ei1>', 'eng1.', - 'xuaessi7.', 'xuae1>', 'uaes0.', 'uae3.', 'xuave2l.', 'xuav2li>', 'xua3la>', 'ela1>', 'lart2.', 'lani2>', 'la' . chr(233) . '2>', - 'siay4i.', 'siassia7.', 'siarv1*.', 'sia1>', 'tneiayo6i.', 'tneiay6i.', 'tneiassia9.', 'tneiareio7.', 'tneia5>', 'tneia4>', 'tiario4.', - 'tiarim3.', 'tiaria3.', 'tiaris3.', 'tiari5.', 'tiarve6>', 'tiare5>', 'iare4>', 'are3>', 'tiay4i.', 'tia3>', 'tnay4i.', - 'em' . chr(232) . 'iu5>', 'em' . chr(232) . 'i4>', 'tnaun3.', 'tnauqo3.', 'tnau4>', 'tnaf0.', 'tnat' . chr(233) . '2>', 'tna3>', 'tno3>', - 'zeiy4i.', 'zey3i.', 'zeire5>', 'zeird4.', 'zeirio4.', 'ze2>', 'ssiab0.', 'ssia4.', 'ssi3.', 'tnemma6>', 'tnemesuey9i.', 'tnemesue8>', - 'tnemevi7.', 'tnemessia5.', 'tnemessi8.', 'tneme5>', 'tnemia4.', 'tnem' . chr(233) . '5>', 'el2l>', 'lle3le>', 'let' . chr(244) . '0.', - 'lepp0.', 'le2>', 'srei1>', 'reit3.', 'reila2.', 'rei3>', 'ert' . chr(226) . 'e5.', 'ert' . chr(226) . chr(233) . '1.', - 'ert' . chr(226) . '4.', 'drai4.', 'erdro0.', 'erute5.', 'ruta0.', 'eruta1.', 'erutiov1.', 'erub3.', 'eruh3.', 'erul3.', 'er2r>', 'nn1>', - 'r' . chr(232) . 'i3.', 'srev0.', 'sr1>', 'rid2>', 're2>', 'xuei4.', 'esuei5.', 'lbati3.', 'lba3>', 'rueis0.', 'ruehcn4.', 'ecirta6.', - 'ruetai6.', 'rueta5.', 'rueir0.', 'rue3>', 'esseti6.', 'essere6>', 'esserd1.', 'esse4>', 'essiab1.', 'essia5.', 'essio1.', 'essi4.', - 'essal4.', 'essa1>', 'ssab1.', 'essurp1.', 'essu4.', 'essi1.', 'ssor1.', 'essor2.', 'esso1>', 'ess2>', 'tio3.', 'r' . chr(232) . 's2re.', - 'r' . chr(232) . '0e.', 'esn1.', 'eu1>', 'sua0.', 'su1>', 'utt1>', 'tu' . chr(231) . '3c.', 'u' . chr(231) . '2c.', 'ur1.', 'ehcn2>', - 'ehcu1>', 'snorr3.', 'snoru3.', 'snorua3.', 'snorv3.', 'snorio4.', 'snori5.', 'snore5>', 'snortt4>', 'snort' . chr(238) . 
'a7.', 'snort3.', - 'snor4.', 'snossi6.', 'snoire6.', 'snoird5.', 'snoitai7.', 'snoita6.', 'snoits1>', 'noits0.', 'snoi4>', 'noitaci7>', 'noitai6.', 'noita5.', - 'noitu4.', 'noi3>', 'snoya0.', 'snoy4i.', 'sno' . chr(231) . 'a1.', 'sno' . chr(231) . 'r1.', 'snoe4.', 'snosiar1>', 'snola1.', 'sno3>', - 'sno1>', 'noll2.', 'tnennei4.', 'ennei2>', 'snei1>', 'sne' . chr(233) . '1>', 'enne' . chr(233) . '5e.', 'ne' . chr(233) . '3e.', 'neic0.', - 'neiv0.', 'nei3.', 'sc1.', 'sd1.', 'sg1.', 'sni1.', 'tiu0.', 'ti2.', 'sp1>', 'sna1>', 'sue1.', 'enn2>', 'nong2.', 'noss2.', 'rioe4.', - 'riot0.', 'riorc1.', 'riovec5.', 'rio3.', 'ric2.', 'ril2.', 'tnerim3.', 'tneris3>', 'tneri5.', 't' . chr(238) . 'a3.', 'riss2.', - 't' . chr(238) . '2.', 't' . chr(226) . '2>', 'ario2.', 'arim1.', 'ara1.', 'aris1.', 'ari3.', 'art1>', 'ardn2.', 'arr1.', 'arua1.', - 'aro1.', 'arv1.', 'aru1.', 'ar2.', 'rd1.', 'ud1.', 'ul1.', 'ini1.', 'rin2.', 'tnessiab3.', 'tnessia7.', 'tnessi6.', 'tnessni4.', 'sini2.', - 'sl1.', 'iard3.', 'iario3.', 'ia2>', 'io0.', 'iule2.', 'i1>', 'sid2.', 'sic2.', 'esoi4.', 'ed1.', 'ai2>', 'a1>', 'adr1.', - 'tner' . chr(232) . '5>', 'evir1.', 'evio4>', 'evi3.', 'fita4.', 'fi2>', 'enie1.', 'sare4>', 'sari4>', 'sard3.', 'sart2>', 'sa2.', - 'tnessa6>', 'tnessu6>', 'tnegna3.', 'tnegi3.', 'tneg0.', 'tneru5>', 'tnemg0.', 'tnerni4.', 'tneiv1.', 'tne3>', 'une1.', 'en1>', 'nitn2.', - 'ecnay5i.', 'ecnal1.', 'ecna4.', 'ec1>', 'nn1.', 'rit2>', 'rut2>', 'rud2.', 'ugn1>', 'eg1>', 'tuo0.', 'tul2>', 't' . chr(251) . '2>', - 'ev1>', 'v' . chr(232) . '2ve>', 'rtt1>', 'emissi6.', 'em1.', 'ehc1.', 'c' . chr(233) . 'i2c' . chr(232) . '.', 'libi2l.', 'llie1.', - 'liei4i.', 'xuev1.', 'xuey4i.', 'xueni5>', 'xuell4.', 'xuere5.', 'xue3>', 'rb' . chr(233) . '3rb' . chr(232) . '.', 'tur2.', - 'rir' . chr(233) . '4re.', 'rir2.', 'c' . chr(226) . '2ca.', 'snu1.', 'rt' . chr(238) . 'a4.', 'long2.', 'vec2.', chr(231) . '1c>', - 'ssilp3.', 'silp2.', 't' . chr(232) . 'hc2te.', 'n' . chr(232) . 
'm2ne.', 'llepp1.', 'tan2.', 'rv' . chr(232) . '3rve.', - 'rv' . chr(233) . '3rve.', 'r' . chr(232) . '2re.', 'r' . chr(233) . '2re.', 't' . chr(232) . '2te.', 't' . chr(233) . '2te.', 'epp1.', - 'eya2i.', 'ya1i.', 'yo1i.', 'esu1.', 'ugi1.', 'tt1.', 'end0.' - ); - - self::$stemRules = $vars; - - return self::$stemRules; - } - - /** - * Returns the number of the first rule from the rule number - * that can be applied to the given reversed input. - * returns -1 if no rule can be applied, ie the stem has been found - * - * @param string $reversed_input The input to check in reversed order - * @param integer $rule_number The rule number to check - * - * @return integer Number of the first rule - * - * @since 3.0 - */ - private static function getFirstRule($reversed_input, $rule_number) - { - $vars = static::getStemRules(); - - $nb_rules = count($vars['rules']); - - for ($i = $rule_number; $i < $nb_rules; $i++) - { - // Gets the letters from the current rule - $rule = $vars['rules'][$i]; - $rule = preg_replace($vars['rule_pattern'], "\\1", $rule); - - if (strncasecmp(utf8_decode($rule), $reversed_input, strlen(utf8_decode($rule))) == 0) - { - return $i; - } - } - - return -1; - } - - /** - * Check the acceptability of a stem for French language - * - * @param string $reversed_stem The stem to check in reverse form - * - * @return boolean True if stem is acceptable - * - * @since 3.0 - */ - private static function check($reversed_stem) - { - $vars = static::getStemRules(); - - if (preg_match('/[' . $vars['vowels'] . ']$/', utf8_encode($reversed_stem))) - { - // If the form starts with a vowel then at least two letters must remain after stemming (e.g.: "etaient" --> "et") - return (strlen($reversed_stem) > 2); - } - else - { - // If the reversed stem starts with a consonant then at least two letters must remain after stemming - if (strlen($reversed_stem) <= 2) - { - return false; - } - - // And at least one of these must be a vowel or "y" - return preg_match('/[' . 
$vars['vowels'] . ']/', utf8_encode($reversed_stem)); - } - } - - /** - * Paice/Husk stemmer which returns a stem for the given $input - * - * @param string $input The word for which we want the stem in UTF-8 - * - * @return string The stem - * - * @since 3.0 - */ - private static function getStem($input) - { - $vars = static::getStemRules(); - - $intact = true; - $reversed_input = strrev(utf8_decode($input)); - $rule_number = 0; - - // This loop goes through the rules' array until it finds an ending one (ending by '.') or the last one ('end0.') - while (true) - { - $rule_number = self::getFirstRule($reversed_input, $rule_number); - - if ($rule_number == -1) - { - // No other rule can be applied => the stem has been found - break; - } - - $rule = $vars['rules'][$rule_number]; - preg_match($vars['rule_pattern'], $rule, $matches); - - if ($matches[2] != '*' || $intact) - { - $reversed_stem = utf8_decode($matches[4]) . substr($reversed_input, $matches[3], strlen($reversed_input) - $matches[3]); - - if (self::check($reversed_stem)) - { - $reversed_input = $reversed_stem; - - if ($matches[5] == '.') - { - break; - } - } - else - { - // Go to another rule - $rule_number++; - } - } - else - { - // Go to another rule - $rule_number++; - } - } - - return utf8_encode(strrev($reversed_input)); - } -} diff --git a/administrator/components/com_finder/helpers/indexer/stemmer/porter_en.php b/administrator/components/com_finder/helpers/indexer/stemmer/porter_en.php deleted file mode 100644 index e1649958f2b..00000000000 --- a/administrator/components/com_finder/helpers/indexer/stemmer/porter_en.php +++ /dev/null @@ -1,446 +0,0 @@ -cache[$lang][$token])) - { - // Stem the token. - $result = $token; - $result = self::step1ab($result); - $result = self::step1c($result); - $result = self::step2($result); - $result = self::step3($result); - $result = self::step4($result); - $result = self::step5($result); - - // Add the token to the cache. 
- $this->cache[$lang][$token] = $result; - } - - return $this->cache[$lang][$token]; - } - - /** - * Step 1 - * - * @param string $word The token to stem. - * - * @return string - * - * @since 2.5 - */ - private static function step1ab($word) - { - // Part a - if (substr($word, -1) == 's') - { - self::replace($word, 'sses', 'ss') - || self::replace($word, 'ies', 'i') - || self::replace($word, 'ss', 'ss') - || self::replace($word, 's', ''); - } - - // Part b - if (substr($word, -2, 1) != 'e' || !self::replace($word, 'eed', 'ee', 0)) - { - // First rule - $v = self::$regex_vowel; - - // Words ending with ing and ed - // Note use of && and OR, for precedence reasons - if (preg_match("#$v+#", substr($word, 0, -3)) && self::replace($word, 'ing', '') - || preg_match("#$v+#", substr($word, 0, -2)) && self::replace($word, 'ed', '')) - { - // If one of above two test successful - if (!self::replace($word, 'at', 'ate') && !self::replace($word, 'bl', 'ble') && !self::replace($word, 'iz', 'ize')) - { - // Double consonant ending - if (self::doubleConsonant($word) && substr($word, -2) != 'll' && substr($word, -2) != 'ss' && substr($word, -2) != 'zz') - { - $word = substr($word, 0, -1); - } - elseif (self::m($word) == 1 && self::cvc($word)) - { - $word .= 'e'; - } - } - } - } - - return $word; - } - - /** - * Step 1c - * - * @param string $word The token to stem. - * - * @return string - * - * @since 2.5 - */ - private static function step1c($word) - { - $v = self::$regex_vowel; - - if (substr($word, -1) == 'y' && preg_match("#$v+#", substr($word, 0, -1))) - { - self::replace($word, 'y', 'i'); - } - - return $word; - } - - /** - * Step 2 - * - * @param string $word The token to stem. 
- * - * @return string - * - * @since 2.5 - */ - private static function step2($word) - { - switch (substr($word, -2, 1)) - { - case 'a': - self::replace($word, 'ational', 'ate', 0) - || self::replace($word, 'tional', 'tion', 0); - break; - case 'c': - self::replace($word, 'enci', 'ence', 0) - || self::replace($word, 'anci', 'ance', 0); - break; - case 'e': - self::replace($word, 'izer', 'ize', 0); - break; - case 'g': - self::replace($word, 'logi', 'log', 0); - break; - case 'l': - self::replace($word, 'entli', 'ent', 0) - || self::replace($word, 'ousli', 'ous', 0) - || self::replace($word, 'alli', 'al', 0) - || self::replace($word, 'bli', 'ble', 0) - || self::replace($word, 'eli', 'e', 0); - break; - case 'o': - self::replace($word, 'ization', 'ize', 0) - || self::replace($word, 'ation', 'ate', 0) - || self::replace($word, 'ator', 'ate', 0); - break; - case 's': - self::replace($word, 'iveness', 'ive', 0) - || self::replace($word, 'fulness', 'ful', 0) - || self::replace($word, 'ousness', 'ous', 0) - || self::replace($word, 'alism', 'al', 0); - break; - case 't': - self::replace($word, 'biliti', 'ble', 0) - || self::replace($word, 'aliti', 'al', 0) - || self::replace($word, 'iviti', 'ive', 0); - break; - } - - return $word; - } - - /** - * Step 3 - * - * @param string $word The token to stem. - * - * @return string - * - * @since 2.5 - */ - private static function step3($word) - { - switch (substr($word, -2, 1)) - { - case 'a': - self::replace($word, 'ical', 'ic', 0); - break; - case 's': - self::replace($word, 'ness', '', 0); - break; - case 't': - self::replace($word, 'icate', 'ic', 0) - || self::replace($word, 'iciti', 'ic', 0); - break; - case 'u': - self::replace($word, 'ful', '', 0); - break; - case 'v': - self::replace($word, 'ative', '', 0); - break; - case 'z': - self::replace($word, 'alize', 'al', 0); - break; - } - - return $word; - } - - /** - * Step 4 - * - * @param string $word The token to stem. 
- * - * @return string - * - * @since 2.5 - */ - private static function step4($word) - { - switch (substr($word, -2, 1)) - { - case 'a': - self::replace($word, 'al', '', 1); - break; - case 'c': - self::replace($word, 'ance', '', 1) - || self::replace($word, 'ence', '', 1); - break; - case 'e': - self::replace($word, 'er', '', 1); - break; - case 'i': - self::replace($word, 'ic', '', 1); - break; - case 'l': - self::replace($word, 'able', '', 1) - || self::replace($word, 'ible', '', 1); - break; - case 'n': - self::replace($word, 'ant', '', 1) - || self::replace($word, 'ement', '', 1) - || self::replace($word, 'ment', '', 1) - || self::replace($word, 'ent', '', 1); - break; - case 'o': - if (substr($word, -4) == 'tion' || substr($word, -4) == 'sion') - { - self::replace($word, 'ion', '', 1); - } - else - { - self::replace($word, 'ou', '', 1); - } - break; - case 's': - self::replace($word, 'ism', '', 1); - break; - case 't': - self::replace($word, 'ate', '', 1) - || self::replace($word, 'iti', '', 1); - break; - case 'u': - self::replace($word, 'ous', '', 1); - break; - case 'v': - self::replace($word, 'ive', '', 1); - break; - case 'z': - self::replace($word, 'ize', '', 1); - break; - } - - return $word; - } - - /** - * Step 5 - * - * @param string $word The token to stem. - * - * @return string - * - * @since 2.5 - */ - private static function step5($word) - { - // Part a - if (substr($word, -1) == 'e') - { - if (self::m(substr($word, 0, -1)) > 1) - { - self::replace($word, 'e', ''); - } - elseif (self::m(substr($word, 0, -1)) == 1) - { - if (!self::cvc(substr($word, 0, -1))) - { - self::replace($word, 'e', ''); - } - } - } - - // Part b - if (self::m($word) > 1 && self::doubleConsonant($word) && substr($word, -1) == 'l') - { - $word = substr($word, 0, -1); - } - - return $word; - } - - /** - * Replaces the first string with the second, at the end of the string. If third - * arg is given, then the preceding string must match that m count at least. 
- * - * @param string &$str String to check - * @param string $check Ending to check for - * @param string $repl Replacement string - * @param integer $m Optional minimum number of m() to meet - * - * @return boolean Whether the $check string was at the end - * of the $str string. True does not necessarily mean - * that it was replaced. - * - * @since 2.5 - */ - private static function replace(&$str, $check, $repl, $m = null) - { - $len = 0 - strlen($check); - - if (substr($str, $len) == $check) - { - $substr = substr($str, 0, $len); - - if (is_null($m) || self::m($substr) > $m) - { - $str = $substr . $repl; - } - - return true; - } - - return false; - } - - /** - * m() measures the number of consonant sequences in $str. if c is - * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary - * presence, - * - * gives 0 - * vc gives 1 - * vcvc gives 2 - * vcvcvc gives 3 - * - * @param string $str The string to return the m count for - * - * @return integer The m count - * - * @since 2.5 - */ - private static function m($str) - { - $c = self::$regex_consonant; - $v = self::$regex_vowel; - - $str = preg_replace("#^$c+#", '', $str); - $str = preg_replace("#$v+$#", '', $str); - - preg_match_all("#($v+$c+)#", $str, $matches); - - return count($matches[1]); - } - - /** - * Returns true/false as to whether the given string contains two - * of the same consonant next to each other at the end of the string. 
- * - * @param string $str String to check - * - * @return boolean Result - * - * @since 2.5 - */ - private static function doubleConsonant($str) - { - $c = self::$regex_consonant; - - return preg_match("#$c{2}$#", $str, $matches) && $matches[0]{0} == $matches[0]{1}; - } - - /** - * Checks for ending CVC sequence where second C is not W, X or Y - * - * @param string $str String to check - * - * @return boolean Result - * - * @since 2.5 - */ - private static function cvc($str) - { - $c = self::$regex_consonant; - $v = self::$regex_vowel; - - return preg_match("#($c$v$c)$#", $str, $matches) && strlen($matches[1]) == 3 && $matches[1]{2} != 'w' && $matches[1]{2} != 'x' - && $matches[1]{2} != 'y'; - } -} diff --git a/administrator/components/com_finder/helpers/indexer/stemmer/snowball.php b/administrator/components/com_finder/helpers/indexer/stemmer/snowball.php deleted file mode 100644 index b41973434b4..00000000000 --- a/administrator/components/com_finder/helpers/indexer/stemmer/snowball.php +++ /dev/null @@ -1,133 +0,0 @@ -sef ?? '*'; - $lang = $defaultLang; - } - - // Stem the token if it is not in the cache. - if (!isset($this->cache[$lang][$token])) - { - // Get the stem function from the language string. - switch ($lang) - { - // Danish stemmer. - case 'da': - $function = 'stem_danish'; - break; - - // German stemmer. - case 'de': - $function = 'stem_german'; - break; - - // English stemmer. - default: - case 'en': - $function = 'stem_english'; - break; - - // Spanish stemmer. - case 'es': - $function = 'stem_spanish'; - break; - - // Finnish stemmer. - case 'fi': - $function = 'stem_finnish'; - break; - - // French stemmer. - case 'fr': - $function = 'stem_french'; - break; - - // Hungarian stemmer. - case 'hu': - $function = 'stem_hungarian'; - break; - - // Italian stemmer. - case 'it': - $function = 'stem_italian'; - break; - - // Norwegian stemmer. - case 'nb': - $function = 'stem_norwegian'; - break; - - // Dutch stemmer. 
- case 'nl': - $function = 'stem_dutch'; - break; - - // Portuguese stemmer. - case 'pt': - $function = 'stem_portuguese'; - break; - - // Romanian stemmer. - case 'ro': - $function = 'stem_romanian'; - break; - - // Russian stemmer. - case 'ru': - $function = 'stem_russian_unicode'; - break; - - // Swedish stemmer. - case 'sv': - $function = 'stem_swedish'; - break; - - // Turkish stemmer. - case 'tr': - $function = 'stem_turkish_unicode'; - break; - } - - // Stem the word if the stemmer method exists. - $this->cache[$lang][$token] = function_exists($function) ? $function($token) : $token; - } - - return $this->cache[$lang][$token]; - } -} diff --git a/administrator/components/com_finder/helpers/indexer/taxonomy.php b/administrator/components/com_finder/helpers/indexer/taxonomy.php index 92637310965..0b90dde30d3 100644 --- a/administrator/components/com_finder/helpers/indexer/taxonomy.php +++ b/administrator/components/com_finder/helpers/indexer/taxonomy.php @@ -10,7 +10,7 @@ defined('_JEXEC') or die; /** - * Stemmer base class for the Finder indexer package. + * Taxonomy base class for the Finder indexer package. * * @since 2.5 */ diff --git a/administrator/components/com_finder/helpers/indexer/token.php b/administrator/components/com_finder/helpers/indexer/token.php index ec804098360..40ea5ec9ff4 100644 --- a/administrator/components/com_finder/helpers/indexer/token.php +++ b/administrator/components/com_finder/helpers/indexer/token.php @@ -98,7 +98,14 @@ class FinderIndexerToken */ public function __construct($term, $lang, $spacer = ' ') { - $this->language = $lang; + if (!$lang) + { + $this->language = '*'; + } + else + { + $this->language = $lang; + } // Tokens can be a single word or an array of words representing a phrase. 
if (is_array($term)) diff --git a/administrator/language/en-GB/en-GB.com_finder.ini b/administrator/language/en-GB/en-GB.com_finder.ini index 3d427037f7b..1109182b3e6 100644 --- a/administrator/language/en-GB/en-GB.com_finder.ini +++ b/administrator/language/en-GB/en-GB.com_finder.ini @@ -35,11 +35,6 @@ COM_FINDER_CONFIG_SORT_OPTION_LIST_PRICE="List price" COM_FINDER_CONFIG_SORT_OPTION_RELEVANCE="Relevance" COM_FINDER_CONFIG_SORT_OPTION_START_DATE="Date" COM_FINDER_CONFIG_SORT_ORDER_LABEL="Sort Field" -COM_FINDER_CONFIG_STEMMER_ENABLE_LABEL="Enable Language Stemmer" -COM_FINDER_CONFIG_STEMMER_FR="French Only" -COM_FINDER_CONFIG_STEMMER_LABEL="Select Language Stemmer" -COM_FINDER_CONFIG_STEMMER_PORTER_EN="English Only" -COM_FINDER_CONFIG_STEMMER_SNOWBALL="Snowball" COM_FINDER_CONFIG_TEXT_MULTIPLIER_DESCRIPTION="The multiplier is used to control how much influence matching text has on the overall relevance score of a search result. A multiplier is considered in relationship to the other multipliers. The body text comes from the summary and/or body of the content." COM_FINDER_CONFIG_TEXT_MULTIPLIER_LABEL="Body Text Weight Multiplier" COM_FINDER_CONFIG_TITLE_MULTIPLIER_DESCRIPTION="The multiplier is used to control how much influence matching text has on the overall relevance score of a search result. A multiplier is considered in relationship to the other multipliers. The title text comes from the title of the content." diff --git a/build.xml b/build.xml index ee810a4e3cc..34f3e2859eb 100644 --- a/build.xml +++ b/build.xml @@ -37,6 +37,7 @@ + diff --git a/components/com_finder/Model/SearchModel.php b/components/com_finder/Model/SearchModel.php index c2423e89f37..a3fb724f8ae 100644 --- a/components/com_finder/Model/SearchModel.php +++ b/components/com_finder/Model/SearchModel.php @@ -18,9 +18,9 @@ use Joomla\Utilities\ArrayHelper; // Register dependent classes. define('FINDER_PATH_INDEXER', JPATH_ADMINISTRATOR . 
'/components/com_finder/helpers/indexer'); \JLoader::register('FinderIndexerHelper', FINDER_PATH_INDEXER . '/helper.php'); +\JLoader::register('FinderIndexerLanguage', FINDER_PATH_INDEXER . '/language.php'); \JLoader::register('FinderIndexerQuery', FINDER_PATH_INDEXER . '/query.php'); \JLoader::register('FinderIndexerResult', FINDER_PATH_INDEXER . '/result.php'); -\JLoader::register('FinderIndexerStemmer', FINDER_PATH_INDEXER . '/stemmer.php'); /** * Search model class for the Finder package. @@ -221,6 +221,7 @@ class SearchModel extends ListModel $query->where($db->quoteName('l.start_date') . ' = ' . $date2); } } + // Filter by language if ($this->getState('filter.language')) { @@ -266,12 +267,27 @@ class SearchModel extends ListModel * If there are no optional or required search terms in the query, we * can get the results in one relatively simple database query. */ - if (empty($this->includedTerms)) + if (empty($this->includedTerms) && $this->searchquery->empty) { // Return the results. return $query; } + /* + * If there are no optional or required search terms in the query and + * empty searches are not allowed, we return an empty query. + */ + if (empty($this->includedTerms) && !$this->searchquery->empty) + { + // Since we need to return a query, we simplify this one. + $query->clear('join') + ->clear('where') + ->clear('group') + ->where('false'); + + return $query; + } + $included = call_user_func_array('array_merge', $this->includedTerms); $query->join('INNER', $this->_db->quoteName('#__finder_links_terms') . ' AS m ON m.link_id = l.link_id') ->where('m.term_id IN (' . implode(',', $included) . ')'); @@ -291,10 +307,14 @@ class SearchModel extends ListModel */ if (count($this->requiredTerms)) { - $required = call_user_func_array('array_merge', $this->requiredTerms); - $query->join('INNER', $this->_db->quoteName('#__finder_links_terms') . ' AS r ON r.link_id = l.link_id') - ->where('r.term_id IN (' . implode(',', $required) . 
')') - ->having('COUNT(DISTINCT r.term_id) = ' . count($required)); + $i = 0; + + foreach ($this->requiredTerms as $terms) + { + $query->join('INNER', $this->_db->quoteName('#__finder_links_terms') . ' AS r' . $i . ' ON r' . $i . '.link_id = l.link_id') + ->where('r' . $i . '.term_id IN (' . implode(',', $terms) . ')'); + $i++; + } } return $query; @@ -361,12 +381,6 @@ class SearchModel extends ListModel $this->setState('filter.language', Multilanguage::isEnabled()); - // Setup the stemmer. - if ($params->get('stem', 1) && $params->get('stemmer', 'porter_en')) - { - \FinderIndexerHelper::$stemmer = \FinderIndexerStemmer::getInstance($params->get('stemmer', 'porter_en')); - } - $request = $input->request; $options = array(); diff --git a/components/com_finder/Model/SuggestionsModel.php b/components/com_finder/Model/SuggestionsModel.php index 0993efe1b6d..3f86a89ff82 100644 --- a/components/com_finder/Model/SuggestionsModel.php +++ b/components/com_finder/Model/SuggestionsModel.php @@ -65,13 +65,14 @@ class SuggestionsModel extends ListModel // Create a new query object. $db = $this->getDbo(); $query = $db->getQuery(true); + $lang = \FinderIndexerHelper::getPrimaryLanguage($this->getState('language')); // Select required fields $query->select('t.term') ->from($db->quoteName('#__finder_terms') . ' AS t') ->where('t.term LIKE ' . $db->quote($db->escape($this->getState('input'), true) . '%')) ->where('t.common = 0') - ->where('t.language IN (' . $db->quote($db->escape($this->getState('language'), true)) . ', ' . $db->quote('*') . ')') + ->where('t.language IN (' . $db->quote($lang) . ', ' . $db->quote('*') . 
')') ->order('t.links DESC') ->order('t.weight DESC'); diff --git a/composer.json b/composer.json index 36b1b6d11cb..c219c5b7863 100644 --- a/composer.json +++ b/composer.json @@ -80,7 +80,8 @@ "symfony/debug": "3.4.*", "symfony/ldap": "3.4.*", "symfony/web-link": "3.4.*", - "symfony/yaml": "3.4.*" + "symfony/yaml": "3.4.*", + "wamania/php-stemmer": "^1.2" }, "require-dev": { "phpunit/phpunit": "~6.0", diff --git a/composer.lock b/composer.lock index 4349fe50e9f..01d31e8268a 100644 --- a/composer.lock +++ b/composer.lock @@ -1,10 +1,10 @@ { "_readme": [ "This file locks the dependencies of your project to a known state", - "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", + "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file", "This file is @generated automatically" ], - "content-hash": "3bd7695b38b737c3b555d83058b877fd", + "content-hash": "81c9ca521a0712b07e07143b469e835a", "packages": [ { "name": "composer/ca-bundle", @@ -2513,6 +2513,50 @@ "homepage": "https://symfony.com", "time": "2018-05-03T23:18:14+00:00" }, + { + "name": "wamania/php-stemmer", + "version": "1.2", + "source": { + "type": "git", + "url": "https://github.com/wamania/php-stemmer.git", + "reference": "6cc76829bddd46f7ae7678e0bf87a0c872c8cf58" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/wamania/php-stemmer/zipball/6cc76829bddd46f7ae7678e0bf87a0c872c8cf58", + "reference": "6cc76829bddd46f7ae7678e0bf87a0c872c8cf58", + "shasum": "" + }, + "require": { + "php": ">=5.3.0" + }, + "require-dev": { + "phpunit/phpunit": "^4.8" + }, + "type": "library", + "autoload": { + "psr-4": { + "Wamania\\Snowball\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Wamania", + "homepage": "http://wamania.com" + } + ], + "description": "Native PHP5 Stemmer", + "keywords": [ + "php", + "porter", + "stemmer" + ], + 
"time": "2017-01-27T17:16:44+00:00" + }, { "name": "zendframework/zend-diactoros", "version": "1.7.2", diff --git a/libraries/vendor/composer/ClassLoader.php b/libraries/vendor/composer/ClassLoader.php index dc02dfb114f..2c72175e772 100644 --- a/libraries/vendor/composer/ClassLoader.php +++ b/libraries/vendor/composer/ClassLoader.php @@ -379,9 +379,9 @@ class ClassLoader $subPath = substr($subPath, 0, $lastPos); $search = $subPath.'\\'; if (isset($this->prefixDirsPsr4[$search])) { - $pathEnd = DIRECTORY_SEPARATOR . substr($logicalPathPsr4, $lastPos + 1); foreach ($this->prefixDirsPsr4[$search] as $dir) { - if (file_exists($file = $dir . $pathEnd)) { + $length = $this->prefixLengthsPsr4[$first][$search]; + if (file_exists($file = $dir . DIRECTORY_SEPARATOR . substr($logicalPathPsr4, $length))) { return $file; } } diff --git a/libraries/vendor/composer/autoload_classmap.php b/libraries/vendor/composer/autoload_classmap.php index b6e387161c9..2c82b493040 100644 --- a/libraries/vendor/composer/autoload_classmap.php +++ b/libraries/vendor/composer/autoload_classmap.php @@ -946,6 +946,21 @@ return array( 'Symfony\\Polyfill\\Util\\BinaryOnFuncOverload' => $vendorDir . '/symfony/polyfill-util/BinaryOnFuncOverload.php', 'Symfony\\Polyfill\\Util\\LegacyTestListener' => $vendorDir . '/symfony/polyfill-util/LegacyTestListener.php', 'Symfony\\Polyfill\\Util\\TestListenerTrait' => $vendorDir . '/symfony/polyfill-util/TestListenerTrait.php', + 'Wamania\\Snowball\\Danish' => $vendorDir . '/wamania/php-stemmer/src/Danish.php', + 'Wamania\\Snowball\\Dutch' => $vendorDir . '/wamania/php-stemmer/src/Dutch.php', + 'Wamania\\Snowball\\English' => $vendorDir . '/wamania/php-stemmer/src/English.php', + 'Wamania\\Snowball\\French' => $vendorDir . '/wamania/php-stemmer/src/French.php', + 'Wamania\\Snowball\\German' => $vendorDir . '/wamania/php-stemmer/src/German.php', + 'Wamania\\Snowball\\Italian' => $vendorDir . 
'/wamania/php-stemmer/src/Italian.php', + 'Wamania\\Snowball\\Norwegian' => $vendorDir . '/wamania/php-stemmer/src/Norwegian.php', + 'Wamania\\Snowball\\Portuguese' => $vendorDir . '/wamania/php-stemmer/src/Portuguese.php', + 'Wamania\\Snowball\\Romanian' => $vendorDir . '/wamania/php-stemmer/src/Romanian.php', + 'Wamania\\Snowball\\Russian' => $vendorDir . '/wamania/php-stemmer/src/Russian.php', + 'Wamania\\Snowball\\Spanish' => $vendorDir . '/wamania/php-stemmer/src/Spanish.php', + 'Wamania\\Snowball\\Stem' => $vendorDir . '/wamania/php-stemmer/src/Stem.php', + 'Wamania\\Snowball\\Stemmer' => $vendorDir . '/wamania/php-stemmer/src/Stemmer.php', + 'Wamania\\Snowball\\Swedish' => $vendorDir . '/wamania/php-stemmer/src/Swedish.php', + 'Wamania\\Snowball\\Utf8' => $vendorDir . '/wamania/php-stemmer/src/Utf8.php', 'Zend\\Diactoros\\AbstractSerializer' => $vendorDir . '/zendframework/zend-diactoros/src/AbstractSerializer.php', 'Zend\\Diactoros\\CallbackStream' => $vendorDir . '/zendframework/zend-diactoros/src/CallbackStream.php', 'Zend\\Diactoros\\Exception\\DeprecatedMethodException' => $vendorDir . '/zendframework/zend-diactoros/src/Exception/DeprecatedMethodException.php', diff --git a/libraries/vendor/composer/autoload_psr4.php b/libraries/vendor/composer/autoload_psr4.php index ea5de7a75f6..db5c51e5d91 100644 --- a/libraries/vendor/composer/autoload_psr4.php +++ b/libraries/vendor/composer/autoload_psr4.php @@ -7,6 +7,7 @@ $baseDir = dirname(dirname($vendorDir)); return array( 'Zend\\Diactoros\\' => array($vendorDir . '/zendframework/zend-diactoros/src'), + 'Wamania\\Snowball\\' => array($vendorDir . '/wamania/php-stemmer/src'), 'Symfony\\Polyfill\\Util\\' => array($vendorDir . '/symfony/polyfill-util'), 'Symfony\\Polyfill\\Php56\\' => array($vendorDir . '/symfony/polyfill-php56'), 'Symfony\\Polyfill\\Mbstring\\' => array($vendorDir . 
'/symfony/polyfill-mbstring'), diff --git a/libraries/vendor/composer/autoload_static.php b/libraries/vendor/composer/autoload_static.php index 73ce2ad8fbc..50352e006e6 100644 --- a/libraries/vendor/composer/autoload_static.php +++ b/libraries/vendor/composer/autoload_static.php @@ -34,6 +34,10 @@ class ComposerStaticInita4c4383b02fcf9dfb95cc0397c641cf1 array ( 'Zend\\Diactoros\\' => 15, ), + 'W' => + array ( + 'Wamania\\Snowball\\' => 17, + ), 'S' => array ( 'Symfony\\Polyfill\\Util\\' => 22, @@ -110,6 +114,10 @@ class ComposerStaticInita4c4383b02fcf9dfb95cc0397c641cf1 array ( 0 => __DIR__ . '/..' . '/zendframework/zend-diactoros/src', ), + 'Wamania\\Snowball\\' => + array ( + 0 => __DIR__ . '/..' . '/wamania/php-stemmer/src', + ), 'Symfony\\Polyfill\\Util\\' => array ( 0 => __DIR__ . '/..' . '/symfony/polyfill-util', @@ -1233,6 +1241,21 @@ class ComposerStaticInita4c4383b02fcf9dfb95cc0397c641cf1 'Symfony\\Polyfill\\Util\\BinaryOnFuncOverload' => __DIR__ . '/..' . '/symfony/polyfill-util/BinaryOnFuncOverload.php', 'Symfony\\Polyfill\\Util\\LegacyTestListener' => __DIR__ . '/..' . '/symfony/polyfill-util/LegacyTestListener.php', 'Symfony\\Polyfill\\Util\\TestListenerTrait' => __DIR__ . '/..' . '/symfony/polyfill-util/TestListenerTrait.php', + 'Wamania\\Snowball\\Danish' => __DIR__ . '/..' . '/wamania/php-stemmer/src/Danish.php', + 'Wamania\\Snowball\\Dutch' => __DIR__ . '/..' . '/wamania/php-stemmer/src/Dutch.php', + 'Wamania\\Snowball\\English' => __DIR__ . '/..' . '/wamania/php-stemmer/src/English.php', + 'Wamania\\Snowball\\French' => __DIR__ . '/..' . '/wamania/php-stemmer/src/French.php', + 'Wamania\\Snowball\\German' => __DIR__ . '/..' . '/wamania/php-stemmer/src/German.php', + 'Wamania\\Snowball\\Italian' => __DIR__ . '/..' . '/wamania/php-stemmer/src/Italian.php', + 'Wamania\\Snowball\\Norwegian' => __DIR__ . '/..' . '/wamania/php-stemmer/src/Norwegian.php', + 'Wamania\\Snowball\\Portuguese' => __DIR__ . '/..' . 
'/wamania/php-stemmer/src/Portuguese.php', + 'Wamania\\Snowball\\Romanian' => __DIR__ . '/..' . '/wamania/php-stemmer/src/Romanian.php', + 'Wamania\\Snowball\\Russian' => __DIR__ . '/..' . '/wamania/php-stemmer/src/Russian.php', + 'Wamania\\Snowball\\Spanish' => __DIR__ . '/..' . '/wamania/php-stemmer/src/Spanish.php', + 'Wamania\\Snowball\\Stem' => __DIR__ . '/..' . '/wamania/php-stemmer/src/Stem.php', + 'Wamania\\Snowball\\Stemmer' => __DIR__ . '/..' . '/wamania/php-stemmer/src/Stemmer.php', + 'Wamania\\Snowball\\Swedish' => __DIR__ . '/..' . '/wamania/php-stemmer/src/Swedish.php', + 'Wamania\\Snowball\\Utf8' => __DIR__ . '/..' . '/wamania/php-stemmer/src/Utf8.php', 'Zend\\Diactoros\\AbstractSerializer' => __DIR__ . '/..' . '/zendframework/zend-diactoros/src/AbstractSerializer.php', 'Zend\\Diactoros\\CallbackStream' => __DIR__ . '/..' . '/zendframework/zend-diactoros/src/CallbackStream.php', 'Zend\\Diactoros\\Exception\\DeprecatedMethodException' => __DIR__ . '/..' . '/zendframework/zend-diactoros/src/Exception/DeprecatedMethodException.php', diff --git a/libraries/vendor/composer/installed.json b/libraries/vendor/composer/installed.json index 25ec8b10950..abc49529b01 100644 --- a/libraries/vendor/composer/installed.json +++ b/libraries/vendor/composer/installed.json @@ -2652,5 +2652,51 @@ "psr", "psr-7" ] + }, + { + "name": "wamania/php-stemmer", + "version": "1.2", + "version_normalized": "1.2.0.0", + "source": { + "type": "git", + "url": "https://github.com/wamania/php-stemmer.git", + "reference": "6cc76829bddd46f7ae7678e0bf87a0c872c8cf58" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/wamania/php-stemmer/zipball/6cc76829bddd46f7ae7678e0bf87a0c872c8cf58", + "reference": "6cc76829bddd46f7ae7678e0bf87a0c872c8cf58", + "shasum": "" + }, + "require": { + "php": ">=5.3.0" + }, + "require-dev": { + "phpunit/phpunit": "^4.8" + }, + "time": "2017-01-27T17:16:44+00:00", + "type": "library", + "installation-source": "dist", + "autoload": { + 
"psr-4": { + "Wamania\\Snowball\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Wamania", + "homepage": "http://wamania.com" + } + ], + "description": "Native PHP5 Stemmer", + "keywords": [ + "php", + "porter", + "stemmer" + ] } ] diff --git a/libraries/vendor/wamania/php-stemmer/LICENSE b/libraries/vendor/wamania/php-stemmer/LICENSE new file mode 100644 index 00000000000..5de1b166a74 --- /dev/null +++ b/libraries/vendor/wamania/php-stemmer/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2016 wamania + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/libraries/vendor/wamania/php-stemmer/src/Danish.php b/libraries/vendor/wamania/php-stemmer/src/Danish.php new file mode 100644 index 00000000000..357f573cfd3 --- /dev/null +++ b/libraries/vendor/wamania/php-stemmer/src/Danish.php @@ -0,0 +1,149 @@ +word = Utf8::strtolower($word); + + // R2 is not used: R1 is defined in the same way as in the German stemmer + $this->r1(); + + // then R1 is adjusted so that the region before it contains at least 3 letters. + if ($this->r1Index < 3) { + $this->r1Index = 3; + $this->r1 = Utf8::substr($this->word, 3); + } + + // Do each of steps 1, 2 3 and 4. + $this->step1(); + $this->step2(); + $this->step3(); + $this->step4(); + + return $this->word; + } + + /** + * Define a valid s-ending as one of + * a b c d f g h j k l m n o p r t v y z å + * + * @param string $ending + * @return boolean + */ + private function hasValidSEnding($word) + { + $lastLetter = Utf8::substr($word, -1, 1); + return in_array($lastLetter, array('a', 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y', 'z', 'å')); + } + + /** + * Step 1 + * Search for the longest among the following suffixes in R1, and perform the action indicated. 
+ */ + private function step1() + { + // hed ethed ered e erede ende erende ene erne ere en heden eren er heder erer + // heds es endes erendes enes ernes eres ens hedens erens ers ets erets et eret + // delete + if ( ($position = $this->searchIfInR1(array( + 'erendes', 'erende', 'hedens', 'erede', 'ethed', 'heden', 'endes', 'erets', 'heder', 'ernes', + 'erens', 'ered', 'ende', 'erne', 'eres', 'eren', 'eret', 'erer', 'enes', 'heds', + 'ens', 'ene', 'ere', 'ers', 'ets', 'hed', 'es', 'et', 'er', 'en', 'e' + ))) !== false) { + $this->word = Utf8::substr($this->word, 0, $position); + return true; + } + + // s + // delete if preceded by a valid s-ending + if ( ($position = $this->searchIfInR1(array('s'))) !== false) { + $word = Utf8::substr($this->word, 0, $position); + if ($this->hasValidSEnding($word)) { + $this->word = $word; + } + return true; + } + } + + /** + * Step 2 + * Search for one of the following suffixes in R1, and if found delete the last letter. + * gd dt gt kt + */ + private function step2() + { + if ($this->searchIfInR1(array('gd', 'dt', 'gt', 'kt')) !== false) { + $this->word = Utf8::substr($this->word, 0, -1); + } + } + + /** + * Step 3: + */ + private function step3() + { + // If the word ends igst, remove the final st. + if ($this->search(array('igst')) !== false) { + $this->word = Utf8::substr($this->word, 0, -2); + } + + // Search for the longest among the following suffixes in R1, and perform the action indicated. + // ig lig elig els + // delete, and then repeat step 2 + if ( ($position = $this->searchIfInR1(array('elig', 'lig', 'ig', 'els'))) !== false) { + $this->word = Utf8::substr($this->word, 0, $position); + $this->step2(); + return true; + } + + // løst + // replace with løs + if ($this->searchIfInR1(array('løst')) !== false) { + $this->word = Utf8::substr($this->word, 0, -1); + } + } + + /** + * Step 4: undouble + * If the word ends with double consonant in R1, remove one of the consonants. 
+ */ + private function step4() + { + $length = Utf8::strlen($this->word); + if (!$this->inR1(($length-1))) { + return false; + } + + $lastLetter = Utf8::substr($this->word, -1, 1); + if (in_array($lastLetter, self::$vowels)) { + return false; + } + $beforeLastLetter = Utf8::substr($this->word, -2, 1); + + if ($lastLetter == $beforeLastLetter) { + $this->word = Utf8::substr($this->word, 0, -1); + } + return true; + } +} diff --git a/libraries/vendor/wamania/php-stemmer/src/Dutch.php b/libraries/vendor/wamania/php-stemmer/src/Dutch.php new file mode 100644 index 00000000000..8b8bb618553 --- /dev/null +++ b/libraries/vendor/wamania/php-stemmer/src/Dutch.php @@ -0,0 +1,303 @@ +word = Utf8::strtolower($word); + + // First, remove all umlaut and acute accents. + $this->word = Utf8::str_replace( + array('ä', 'ë', 'ï', 'ö', 'ü', 'á', 'é', 'í', 'ó', 'ú'), + array('a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'u'), + $this->word); + + $this->plainVowels = implode('', self::$vowels); + + // Put initial y, y after a vowel, and i between vowels into upper case. + $this->word = preg_replace('#^y#u', 'Y', $this->word); + $this->word = preg_replace('#(['.$this->plainVowels.'])y#u', '$1Y', $this->word); + $this->word = preg_replace('#(['.$this->plainVowels.'])i(['.$this->plainVowels.'])#u', '$1I$2', $this->word); + + // R1 and R2 (see the note on R1 and R2) are then defined as in German. + // R1 and R2 are first set up in the standard way + $this->r1(); + $this->r2(); + + // but then R1 is adjusted so that the region before it contains at least 3 letters. + if ($this->r1Index < 3) { + $this->r1Index = 3; + $this->r1 = Utf8::substr($this->word, 3); + } + + // Do each of steps 1, 2 3 and 4. + $this->step1(); + $removedE = $this->step2(); + $this->step3a(); + $this->step3b($removedE); + $this->step4(); + $this->finish(); + + return $this->word; + } + + /** + * Define a valid s-ending as a non-vowel other than j. 
+ * @param string $ending + * @return boolean + */ + private function hasValidSEnding($word) + { + $lastLetter = Utf8::substr($word, -1, 1); + return !in_array($lastLetter, array_merge(self::$vowels, array('j'))); + } + + /** + * Define a valid en-ending as a non-vowel, and not gem. + * @param string $ending + * @return boolean + */ + private function hasValidEnEnding($word) + { + $lastLetter = Utf8::substr($word, -1, 1); + if (in_array($lastLetter, self::$vowels)) { + return false; + } + + $threeLastLetters = Utf8::substr($word, -3, 3); + if ($threeLastLetters == 'gem') { + return false; + } + return true; + } + + /** + * Define undoubling the ending as removing the last letter if the word ends kk, dd or tt. + */ + private function unDoubling() + { + if ($this->search(array('kk', 'dd', 'tt')) !== false) { + $this->word = Utf8::substr($this->word, 0, -1); + } + } + + /** + * Step 1 + * Search for the longest among the following suffixes, and perform the action indicated + */ + private function step1() + { + // heden + // replace with heid if in R1 + if ( ($position = $this->search(array('heden'))) !== false) { + if ($this->inR1($position)) { + $this->word = preg_replace('#(heden)$#u', 'heid', $this->word); + } + return true; + } + + // en ene + // delete if in R1 and preceded by a valid en-ending, and then undouble the ending + if ( ($position = $this->search(array('ene', 'en'))) !== false) { + if ($this->inR1($position)) { + $word = Utf8::substr($this->word, 0, $position); + if ($this->hasValidEnEnding($word)) { + $this->word = $word; + $this->unDoubling(); + } + } + return true; + } + + // s se + // delete if in R1 and preceded by a valid s-ending + if ( ($position = $this->search(array('se', 's'))) !== false) { + if ($this->inR1($position)) { + $word = Utf8::substr($this->word, 0, $position); + if ($this->hasValidSEnding($word)) { + $this->word = $word; + } + } + return true; + } + + return false; + } + + /** + * Step 2 + * Delete suffix e if in R1 and preceded 
by a non-vowel, and then undouble the ending + */ + private function step2() + { + if ( ($position = $this->search(array('e'))) !== false) { + if ($this->inR1($position)) { + $letter = Utf8::substr($this->word, -2, 1); + if (!in_array($letter, self::$vowels)) { + $this->word = Utf8::substr($this->word, 0, $position); + $this->unDoubling(); + + return true; + } + } + } + + return false; + } + + /** + * Step 3a: heid + * delete heid if in R2 and not preceded by c, and treat a preceding en as in step 1(b) + */ + private function step3a() + { + if ( ($position = $this->search(array('heid'))) !== false) { + if ($this->inR2($position)) { + $letter = Utf8::substr($this->word, -5, 1); + if ($letter !== 'c') { + $this->word = Utf8::substr($this->word, 0, $position); + + if ( ($position = $this->search(array('en'))) !== false) { + if ($this->inR1($position)) { + $word = Utf8::substr($this->word, 0, $position); + if ($this->hasValidEnEnding($word)) { + $this->word = $word; + $this->unDoubling(); + } + } + } + } + } + } + + } + + /** + * Step 3b: d-suffixe + * Search for the longest among the following suffixes, and perform the action indicated. 
+ */ + private function step3b($removedE) + { + // end ing + // delete if in R2 + // if preceded by ig, delete if in R2 and not preceded by e, otherwise undouble the ending + if ( ($position = $this->search(array('end', 'ing'))) !== false) { + if ($this->inR2($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + + if ( ($position2 = $this->searchIfInR2(array('ig'))) !== false) { + $letter = Utf8::substr($this->word, -3, 1); + if ($letter !== 'e') { + $this->word = Utf8::substr($this->word, 0, $position2); + } + } else { + $this->unDoubling(); + } + } + + + return true; + } + + // ig + // delete if in R2 and not preceded by e + if ( ($position = $this->search(array('ig'))) !== false) { + if ($this->inR2($position)) { + $letter = Utf8::substr($this->word, -3, 1); + if ($letter !== 'e') { + $this->word = Utf8::substr($this->word, 0, $position); + } + } + return true; + } + + // lijk + // delete if in R2, and then repeat step 2 + if ( ($position = $this->search(array('lijk'))) !== false) { + if ($this->inR2($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + $this->step2(); + } + return true; + } + + // baar + // delete if in R2 + if ( ($position = $this->search(array('baar'))) !== false) { + if ($this->inR2($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + return true; + } + + // bar + // delete if in R2 and if step 2 actually removed an e + if ( ($position = $this->search(array('bar'))) !== false) { + if ($this->inR2($position) && $removedE) { + $this->word = Utf8::substr($this->word, 0, $position); + } + return true; + } + + return false; + } + + /** + * Step 4: undouble vowel + * If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u, + * remove one of the vowels from V (for example, maan -> man, brood -> brod). 
+ */ + private function step4() + { + // D is a non-vowel other than I + $d = Utf8::substr($this->word, -1, 1); + if (in_array($d, array_merge(self::$vowels, array('I')))) { + return false; + } + + // V is double a, e, o or u + $v = Utf8::substr($this->word, -3, 2); + if (!in_array($v, array('aa', 'ee', 'oo', 'uu'))) { + return false; + } + $singleV = Utf8::substr($v, 0, 1); + + // C is a non-vowel + $c = Utf8::substr($this->word, -4, 1); + if (in_array($c, self::$vowels)) { + return false; + } + + $this->word = Utf8::substr($this->word, 0, -4); + $this->word .= $c . $singleV .$d; + } + + /** + * Finally + * Turn I and Y back into lower case. + */ + private function finish() + { + $this->word = Utf8::str_replace(array('I', 'Y'), array('i', 'y'), $this->word); + } +} diff --git a/libraries/vendor/wamania/php-stemmer/src/English.php b/libraries/vendor/wamania/php-stemmer/src/English.php new file mode 100644 index 00000000000..2ee72b5bc6e --- /dev/null +++ b/libraries/vendor/wamania/php-stemmer/src/English.php @@ -0,0 +1,599 @@ +word = Utf8::strtolower($word); + + // exceptions + if (null !== ($word = $this->exception1())) { + return $word; + } + + + $this->plainVowels = implode('', self::$vowels); + + // Remove initial ', if present. 
+ $first = Utf8::substr($this->word, 0, 1); + if ($first == "'") { + $this->word = Utf8::substr($this->word, 1); + } + + // Set initial y, or y after a vowel, to Y + if ($first == 'y') { + $this->word = preg_replace('#^y#u', 'Y', $this->word); + } + $this->word = preg_replace('#(['.$this->plainVowels.'])y#u', '$1Y', $this->word); + + $this->r1(); + $this->exceptionR1(); + $this->r2(); + + $this->step0(); + $this->step1a(); + + // exceptions 2 + if (null !== ($word = $this->exception2())) { + return $word; + } + + $this->step1b(); + $this->step1c(); + $this->step2(); + $this->step3(); + $this->step4(); + $this->step5(); + $this->finish(); + + return $this->word; + } + + /** + * Step 0 + * Remove ', 's, 's' + */ + private function step0() + { + if ( ($position = $this->search(array("'s'", "'s", "'"))) !== false) { + $this->word = Utf8::substr($this->word, 0, $position); + } + } + + private function step1a() + { + // sses + // replace by ss + if ( ($position = $this->search(array('sses'))) !== false) { + $this->word = preg_replace('#(sses)$#u', 'ss', $this->word); + return true; + } + + // ied+ ies* + // replace by i if preceded by more than one letter, otherwise by ie (so ties -> tie, cries -> cri) + if ( ($position = $this->search(array('ied', 'ies'))) !== false) { + if ($position > 1) { + $this->word = preg_replace('#(ied|ies)$#u', 'i', $this->word); + + } else { + $this->word = preg_replace('#(ied|ies)$#u', 'ie', $this->word); + } + return true; + } + + // us+ ss + // do nothing + if ( ($position = $this->search(array('us', 'ss'))) !== false) { + return true; + } + + // s + // delete if the preceding word part contains a vowel not immediately before the s (so gas and this retain the s, gaps and kiwis lose it) + if ( ($position = $this->search(array('s'))) !== false) { + for ($i=0; $i<$position-1; $i++) { + $letter = Utf8::substr($this->word, $i, 1); + + if (in_array($letter, self::$vowels)) { + $this->word = Utf8::substr($this->word, 0, $position); + return true; 
+ } + } + return true; + } + + return false; + } + + /** + * Step 1b + */ + private function step1b() + { + // eed eedly+ + // replace by ee if in R1 + if ( ($position = $this->search(array('eedly', 'eed'))) !== false) { + if ($this->inR1($position)) { + $this->word = preg_replace('#(eedly|eed)$#u', 'ee', $this->word); + } + return true; + } + + // ed edly+ ing ingly+ + // delete if the preceding word part contains a vowel, and after the deletion: + // if the word ends at, bl or iz add e (so luxuriat -> luxuriate), or + // if the word ends with a double remove the last letter (so hopp -> hop), or + // if the word is short, add e (so hop -> hope) + if ( ($position = $this->search(array('edly', 'ingly', 'ed', 'ing'))) !== false) { + for ($i=0; $i<$position; $i++) { + $letter = Utf8::substr($this->word, $i, 1); + + if (in_array($letter, self::$vowels)) { + $this->word = Utf8::substr($this->word, 0, $position); + + if ($this->search(array('at', 'bl', 'iz')) !== false) { + $this->word .= 'e'; + + } elseif ( ($position2 = $this->search(self::$doubles)) !== false) { + $this->word = Utf8::substr($this->word, 0, ($position2+1)); + + } elseif ($this->isShort()) { + $this->word .= 'e'; + } + + return true; + } + } + return true; + } + + return false; + } + + /** + * Step 1c: * + */ + private function step1c() + { + // replace suffix y or Y by i if preceded by a non-vowel + // which is not the first letter of the word (so cry -> cri, by -> by, say -> say) + $length = Utf8::strlen($this->word); + + if ($length < 3) { + return true; + } + + if ( ($position = $this->search(array('y', 'Y'))) !== false) { + $before = $position - 1; + $letter = Utf8::substr($this->word, $before, 1); + + if (! in_array($letter, self::$vowels)) { + $this->word = preg_replace('#(y|Y)$#u', 'i', $this->word); + } + + return true; + } + + return false; + } + + /** + * Step 2 + * Search for the longest among the following suffixes, and, if found and in R1, perform the action indicated. 
+     */
+    private function step2()
+    {
+        // Suffix rules that share one shape: look the suffixes up with search(); if one is
+        // found, rewrite it only when it starts inside R1, and stop either way.
+        // Each entry is: suffix list, anchored pattern, replacement text.
+        // The order of the entries is significant and mirrors the order of the checks.
+        $rules = array(
+            array(array('iveness', 'iviti'), '#(iveness|iviti)$#u', 'ive'),
+            array(array('ousli', 'ousness'), '#(ousli|ousness)$#u', 'ous'),
+            array(array('izer', 'ization'), '#(izer|ization)$#u', 'ize'),
+            array(array('ational', 'ation', 'ator'), '#(ational|ation|ator)$#u', 'ate'),
+            array(array('biliti', 'bli'), '#(biliti|bli)$#u', 'ble'),
+            array(array('lessli'), '#(lessli)$#u', 'less'),
+            array(array('fulness', 'fulli'), '#(fulness|fulli)$#u', 'ful'),
+            array(array('tional'), '#(tional)$#u', 'tion'),
+            array(array('alism', 'aliti', 'alli'), '#(alism|aliti|alli)$#u', 'al'),
+            array(array('enci'), '#(enci)$#u', 'ence'),
+            array(array('anci'), '#(anci)$#u', 'ance'),
+            array(array('abli'), '#(abli)$#u', 'able'),
+            array(array('entli'), '#(entli)$#u', 'ent'),
+        );
+
+        foreach ($rules as $rule) {
+            list($suffixes, $pattern, $replacement) = $rule;
+
+            $found = $this->search($suffixes);
+
+            if ($found === false) {
+                continue;
+            }
+
+            if ($this->inR1($found)) {
+                $this->word = preg_replace($pattern, $replacement, $this->word);
+            }
+
+            // A matching suffix ends the step even when it was left untouched.
+            return true;
+        }
+
+        // ogi+: replace by og if in R1 and preceded by l
+        $found = $this->search(array('ogi'));
+
+        if ($found !== false) {
+            if ($this->inR1($found) && Utf8::substr($this->word, $found - 1, 1) == 'l') {
+                $this->word = preg_replace('#(ogi)$#u', 'og', $this->word);
+            }
+
+            return true;
+        }
+
+        // li+: delete if in R1 and preceded by a valid li-ending
+        $found = $this->search(array('li'));
+
+        if ($found !== false) {
+            if ($this->inR1($found) && in_array(Utf8::substr($this->word, $found - 1, 1), self::$liEnding)) {
+                $this->word = Utf8::substr($this->word, 0, $found);
+            }
+
+            return true;
+        }
+
+        // None of the step 2 suffixes is present.
+        return false;
+    }
+
+    /**
+     * Step 3:
+     * Search for the longest among the following suffixes, and, if found and in R1, perform the action indicated.
+     */
+    public function step3()
+    {
+        // Plain replacements, tried in this order. Each entry is: suffix list handed to
+        // searchIfInR1(), anchored pattern, replacement text.
+        $replacements = array(
+            array(array('ational'), '#(ational)$#u', 'ate'),
+            array(array('tional'), '#(tional)$#u', 'tion'),
+            array(array('alize'), '#(alize)$#u', 'al'),
+            array(array('icate', 'iciti', 'ical'), '#(icate|iciti|ical)$#u', 'ic'),
+        );
+
+        foreach ($replacements as $entry) {
+            if ($this->searchIfInR1($entry[0]) === false) {
+                continue;
+            }
+
+            $this->word = preg_replace($entry[1], $entry[2], $this->word);
+
+            return true;
+        }
+
+        // ful ness: delete
+        $cut = $this->searchIfInR1(array('ful', 'ness'));
+
+        if ($cut !== false) {
+            $this->word = Utf8::substr($this->word, 0, $cut);
+
+            return true;
+        }
+
+        // ative*: delete if in R2; otherwise the step reports that nothing was done
+        $cut = $this->searchIfInR1(array('ative'));
+
+        if ($cut === false || !$this->inR2($cut)) {
+            return false;
+        }
+
+        $this->word = Utf8::substr($this->word, 0, $cut);
+
+        return true;
+    }
+
+    /**
+     * Step 4
+     * Search for the longest among the following suffixes, and, if found and in R2, perform the action indicated.
+ */ + public function step4() + { + // ement ance ence able ible ant ment ent ism ate iti ous ive ize al er ic + // delete + if ( ($position = $this->search(array( + 'ance', 'ence', 'ement', 'able', 'ible', 'ant', 'ment', 'ent', 'ism', + 'ate', 'iti', 'ous', 'ive', 'ize', 'al', 'er', 'ic'))) !== false) { + + if ($this->inR2($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + return true; + } + + // ion + // delete if preceded by s or t + if ( ($position = $this->searchIfInR2(array('ion'))) !== false) { + $before = $position - 1; + $letter = Utf8::substr($this->word, $before, 1); + + if ($letter == 's' || $letter == 't') { + $this->word = Utf8::substr($this->word, 0, $position); + } + + return true; + } + + return false; + } + + /** + * Step 5: * + * Search for the the following suffixes, and, if found, perform the action indicated. + */ + public function step5() + { + // e + // delete if in R2, or in R1 and not preceded by a short syllable + if ( ($position = $this->search(array('e'))) !== false) { + if ($this->inR2($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + + } elseif ($this->inR1($position)) { + if ( (! $this->searchShortSyllabe(-4, 3)) && (! 
$this->searchShortSyllabe(-3, 2)) ) { + $this->word = Utf8::substr($this->word, 0, $position); + } + } + + return true; + } + + // l + // delete if in R2 and preceded by l + if ( ($position = $this->searchIfInR2(array('l'))) !== false) { + $before = $position - 1; + $letter = Utf8::substr($this->word, $before, 1); + + if ($letter == 'l') { + $this->word = Utf8::substr($this->word, 0, $position); + } + + return true; + } + + return false; + } + + public function finish() + { + $this->word = Utf8::str_replace('Y', 'y', $this->word); + } + + private function exceptionR1() + { + if (Utf8::strpos($this->word, 'gener') === 0) { + $this->r1 = Utf8::substr($this->word, 5); + $this->r1Index = 5; + + } elseif (Utf8::strpos($this->word, 'commun') === 0) { + $this->r1 = Utf8::substr($this->word, 6); + $this->r1Index = 6; + + } elseif (Utf8::strpos($this->word, 'arsen') === 0) { + $this->r1 = Utf8::substr($this->word, 5); + $this->r1Index = 5; + } + } + + /** + * 1/ Stem certain special words as follows, + * 2/ If one of the following is found, leave it invariant, + */ + private function exception1() + { + $exceptions = array( + 'skis' => 'ski', + 'skies' => 'sky', + 'dying' => 'die', + 'lying' => 'lie', + 'tying' => 'tie', + 'idly' => 'idl', + 'gently' => 'gentl', + 'ugly' => 'ugli', + 'early' => 'earli', + 'only' => 'onli', + 'singly' => 'singl', + // invariants + 'sky' => 'sky', + 'news' => 'news', + 'howe' => 'howe', + 'atlas' => 'atlas', + 'cosmos' => 'cosmos', + 'bias' => 'bias', + 'andes' => 'andes' + ); + + if (isset($exceptions[$this->word])) { + return $exceptions[$this->word]; + } + + return null; + } + + /** + * Following step 1a, leave the following invariant, + */ + private function exception2() + { + $exceptions = array( + 'inning' => 'inning', + 'outing' => 'outing', + 'canning' => 'canning', + 'herring' => 'herring', + 'earring' => 'earring', + 'proceed' => 'proceed', + 'exceed' => 'exceed', + 'succeed' => 'succeed' + ); + + if (isset($exceptions[$this->word])) 
{ + return $exceptions[$this->word]; + } + + return null; + } + + /** + * A word is called short if it ends in a short syllable, and if R1 is null. + * Note : R1 not really null, but the word at this state must be smaller than r1 index + * + * @return boolean + */ + private function isShort() + { + $length = Utf8::strlen($this->word); + return ( ($this->searchShortSyllabe(-3, 3) || $this->searchShortSyllabe(-2, 2)) && ($length == $this->r1Index) ); + } + + /** + * Define a short syllable in a word as either (a) a vowel followed by a non-vowel other than w, x or Y and preceded by a non-vowel, + * or * (b) a vowel at the beginning of the word followed by a non-vowel. + * + * So rap, trap, entrap end with a short syllable, and ow, on, at are classed as short syllables. + * But uproot, bestow, disturb do not end with a short syllable. + */ + private function searchShortSyllabe($from, $nbLetters) + { + $length = Utf8::strlen($this->word); + + if ($from < 0) { + $from = $length + $from; + } + if ($from < 0) { + $from = 0; + } + + // (a) is just for beginning of the word + if ( ($nbLetters == 2) && ($from != 0) ) { + return false; + } + + $first = Utf8::substr($this->word, $from, 1); + $second = Utf8::substr($this->word, ($from+1), 1); + + if ($nbLetters == 2) { + if ( (in_array($first, self::$vowels)) && (!in_array($second, self::$vowels)) ) { + return true; + } + } + + $third = Utf8::substr($this->word, ($from+2), 1); + + if ( (!in_array($first, self::$vowels)) && (in_array($second, self::$vowels)) + && (!in_array($third, array_merge(self::$vowels, array('x', 'Y', 'w'))))) { + return true; + } + + return false; + } +} diff --git a/libraries/vendor/wamania/php-stemmer/src/French.php b/libraries/vendor/wamania/php-stemmer/src/French.php new file mode 100644 index 00000000000..f839e844850 --- /dev/null +++ b/libraries/vendor/wamania/php-stemmer/src/French.php @@ -0,0 +1,530 @@ +word = Utf8::strtolower($word); + + $this->plainVowels = implode('', self::$vowels); + + 
$this->step0(); + + $this->rv(); + $this->r1(); + $this->r2(); + + // to know if step1, 2a or 2b have altered the word + $this->originalWord = $this->word; + + $nextStep = $this->step1(); + + // Do step 2a if either no ending was removed by step 1, or if one of endings amment, emment, ment, ments was found. + if ( ($nextStep == 2) || ($this->originalWord == $this->word) ) { + $modified = $this->step2a(); + if (!$modified) { + $this->step2b(); + } + } + + if ($this->word != $this->originalWord) { + $this->step3(); + + } else { + $this->step4(); + } + + $this->step5(); + $this->step6(); + $this->finish(); + + return $this->word; + } + + + + /** + * Assume the word is in lower case. + * Then put into upper case u or i preceded and followed by a vowel, and y preceded or followed by a vowel. + * u after q is also put into upper case. For example, + * jouer -> joUer + * ennuie -> ennuIe + * yeux -> Yeux + * quand -> qUand + */ + private function step0() + { + $this->word = preg_replace('#([q])u#u', '$1U', $this->word); + $this->word = preg_replace('#(['.$this->plainVowels.'])y#u', '$1Y', $this->word); + $this->word = preg_replace('#y(['.$this->plainVowels.'])#u', 'Y$1', $this->word); + $this->word = preg_replace('#(['.$this->plainVowels.'])u(['.$this->plainVowels.'])#u', '$1U$2', $this->word); + $this->word = preg_replace('#(['.$this->plainVowels.'])i(['.$this->plainVowels.'])#u', '$1I$2', $this->word); + } + + /** + * Step 1 + * Search for the longest among the following suffixes, and perform the action indicated. 
+     *
+     * @return integer Next step number
+     */
+    private function step1()
+    {
+        // Return value: 3 for every standard suffix group below, 2 for the adverb endings
+        // (amment, emment, ment, ments) and when no suffix of this step is present at all.
+        // Every lookup is compared with "!== false" because a suffix found at offset 0 is a
+        // valid match that a loose comparison would treat as "not found".
+
+        // ance iqUe isme able iste eux ances iqUes ismes ables istes
+        // delete if in R2
+        if (($position = $this->search(array('ances', 'iqUes', 'ismes', 'ables', 'istes', 'ance', 'iqUe','isme', 'able', 'iste', 'eux'))) !== false) {
+            if ($this->inR2($position)) {
+                $this->word = Utf8::substr($this->word, 0, $position);
+            }
+            return 3;
+        }
+
+        // atrice ateur ation atrices ateurs ations
+        // delete if in R2
+        // if preceded by ic, delete if in R2, else replace by iqU
+        if (($position = $this->search(array('atrices', 'ateurs', 'ations', 'atrice', 'ateur', 'ation'))) !== false) {
+            if ($this->inR2($position)) {
+                $this->word = Utf8::substr($this->word, 0, $position);
+
+                if (($position2 = $this->searchIfInR2(array('ic'))) !== false) {
+                    $this->word = Utf8::substr($this->word, 0, $position2);
+                } else {
+                    // No "ic" in R2: the pattern rewrites a trailing "ic" if there is one
+                    // and leaves the word alone otherwise.
+                    $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word);
+                }
+            }
+
+            return 3;
+        }
+
+        // logie logies
+        // replace with log if in R2
+        if (($position = $this->search(array('logies', 'logie'))) !== false) {
+            if ($this->inR2($position)) {
+                $this->word = preg_replace('#(logies|logie)$#u', 'log', $this->word);
+            }
+            return 3;
+        }
+
+        // usion ution usions utions
+        // replace with u if in R2
+        if (($position = $this->search(array('usions', 'utions', 'usion', 'ution'))) !== false) {
+            if ($this->inR2($position)) {
+                $this->word = preg_replace('#(usion|ution|usions|utions)$#u', 'u', $this->word);
+            }
+            return 3;
+        }
+
+        // ence ences
+        // replace with ent if in R2
+        if (($position = $this->search(array('ences', 'ence'))) !== false) {
+            if ($this->inR2($position)) {
+                $this->word = preg_replace('#(ence|ences)$#u', 'ent', $this->word);
+            }
+            return 3;
+        }
+
+        // issement issements
+        // delete if in R1 and preceded by a non-vowel
+        if (($position = $this->search(array('issements', 'issement'))) !== false) {
+            // At offset 0 there is no preceding letter to inspect, so nothing is deleted.
+            if ($position > 0 && $this->inR1($position)) {
+                $before = $position - 1;
+                $letter = Utf8::substr($this->word, $before, 1);
+                if (! in_array($letter, self::$vowels)) {
+                    $this->word = Utf8::substr($this->word, 0, $position);
+                }
+            }
+            return 3;
+        }
+
+        // ement ements
+        // delete if in RV
+        // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
+        // if preceded by eus, delete if in R2, else replace by eux if in R1, otherwise,
+        // if preceded by abl or iqU, delete if in R2, otherwise,
+        // if preceded by ièr or Ièr, replace by i if in RV
+        if (($position = $this->search(array('ements', 'ement'))) !== false) {
+
+            // delete if in RV
+            if ($this->inRv($position)) {
+                $this->word = Utf8::substr($this->word, 0, $position);
+            }
+
+            // The follow-up lookups run on the current word; when the suffix was not removed
+            // the word still ends in "ement(s)", so none of them can match.
+
+            // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
+            if (($position = $this->searchIfInR2(array('iv'))) !== false) {
+                $this->word = Utf8::substr($this->word, 0, $position);
+                if (($position2 = $this->searchIfInR2(array('at'))) !== false) {
+                    $this->word = Utf8::substr($this->word, 0, $position2);
+                }
+
+            // if preceded by eus, delete if in R2, else replace by eux if in R1, otherwise,
+            } elseif (($position = $this->search(array('eus'))) !== false) {
+                if ($this->inR2($position)) {
+                    $this->word = Utf8::substr($this->word, 0, $position);
+
+                } elseif ($this->inR1($position)) {
+                    $this->word = preg_replace('#(eus)$#u', 'eux', $this->word);
+                }
+
+            // if preceded by abl or iqU, delete if in R2, otherwise,
+            } elseif (($position = $this->searchIfInR2(array('abl', 'iqU'))) !== false) {
+                $this->word = Utf8::substr($this->word, 0, $position);
+
+            // if preceded by ièr or Ièr, replace by i if in RV
+            } elseif (($position = $this->searchIfInRv(array('ièr', 'Ièr'))) !== false) {
+                $this->word = preg_replace('#(ièr|Ièr)$#u', 'i', $this->word);
+            }
+            return 3;
+        }
+
+        // ité ités
+        // delete if in R2
+        // if preceded by abil, delete if in R2, else replace by abl, otherwise,
+        // if preceded by ic, delete if in R2, else replace by iqU, otherwise,
+        // if preceded by iv, delete if in R2
+        if (($position = $this->search(array('ités', 'ité'))) !== false) {
+
+            // delete if in R2
+            if ($this->inR2($position)) {
+                $this->word = Utf8::substr($this->word, 0, $position);
+            }
+
+            // if preceded by abil, delete if in R2, else replace by abl, otherwise,
+            if (($position = $this->search(array('abil'))) !== false) {
+                if ($this->inR2($position)) {
+                    $this->word = Utf8::substr($this->word, 0, $position);
+                } else {
+                    $this->word = preg_replace('#(abil)$#u', 'abl', $this->word);
+                }
+
+            // if preceded by ic, delete if in R2, else replace by iqU, otherwise,
+            } elseif (($position = $this->search(array('ic'))) !== false) {
+                if ($this->inR2($position)) {
+                    $this->word = Utf8::substr($this->word, 0, $position);
+                } else {
+                    $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word);
+                }
+
+            // if preceded by iv, delete if in R2
+            } elseif (($position = $this->searchIfInR2(array('iv'))) !== false) {
+                $this->word = Utf8::substr($this->word, 0, $position);
+            }
+
+            return 3;
+        }
+
+        // if ive ifs ives
+        // delete if in R2
+        // if preceded by at, delete if in R2 (and if further preceded by ic, delete if in R2, else replace by iqU)
+        if (($position = $this->search(array('ifs', 'ives', 'if', 'ive'))) !== false) {
+
+            if ($this->inR2($position)) {
+                $this->word = Utf8::substr($this->word, 0, $position);
+            }
+
+            if (($position = $this->searchIfInR2(array('at'))) !== false) {
+                $this->word = Utf8::substr($this->word, 0, $position);
+
+                if (($position2 = $this->search(array('ic'))) !== false) {
+                    if ($this->inR2($position2)) {
+                        $this->word = Utf8::substr($this->word, 0, $position2);
+                    } else {
+                        $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word);
+                    }
+                }
+            }
+
+            return 3;
+        }
+
+        // eaux
+        // replace with eau
+        if (($position = $this->search(array('eaux'))) !== false) {
+            $this->word = preg_replace('#(eaux)$#u', 'eau', $this->word);
+            return 3;
+        }
+
+        // aux
+        // replace with al if in R1
+        if (($position = $this->search(array('aux'))) !== false) {
+            if ($this->inR1($position)) {
+                $this->word = preg_replace('#(aux)$#u', 'al', $this->word);
+            }
+            return 3;
+        }
+
+        // euse euses
+        // delete if in R2, else replace by eux if in R1
+        if (($position = $this->search(array('euses', 'euse'))) !== false) {
+            if ($this->inR2($position)) {
+                $this->word = Utf8::substr($this->word, 0, $position);
+
+            } elseif ($this->inR1($position)) {
+                $this->word = preg_replace('#(euses|euse)$#u', 'eux', $this->word);
+            }
+            return 3;
+        }
+
+        // amment
+        // replace with ant if in RV
+        if (($position = $this->search(array('amment'))) !== false) {
+            if ($this->inRv($position)) {
+                $this->word = preg_replace('#(amment)$#u', 'ant', $this->word);
+            }
+            return 2;
+        }
+
+        // emment
+        // replace with ent if in RV
+        if (($position = $this->search(array('emment'))) !== false) {
+            if ($this->inRv($position)) {
+                $this->word = preg_replace('#(emment)$#u', 'ent', $this->word);
+            }
+            return 2;
+        }
+
+        // ment ments
+        // delete if preceded by a vowel in RV
+        if (($position = $this->search(array('ments', 'ment'))) !== false) {
+            // At offset 0 there is no preceding vowel, so the suffix is kept.
+            if ($position > 0) {
+                $before = $position - 1;
+                $letter = Utf8::substr($this->word, $before, 1);
+                if ($this->inRv($before) && (in_array($letter, self::$vowels))) {
+                    $this->word = Utf8::substr($this->word, 0, $position);
+                }
+            }
+
+            return 2;
+        }
+
+        // No suffix of this step is present.
+        return 2;
+    }
+
+    /**
+     * Step 2a: Verb suffixes beginning i
+     * In steps 2a and 2b all tests are confined to the RV region.
+     * Search for the longest among the following suffixes and if found, delete if preceded by a non-vowel.
+     * îmes ît îtes i ie ies ir ira irai iraIent irais irait iras irent irez iriez
+     * irions irons iront is issaIent issais issait issant issante issantes issants isse
+     * issent isses issez issiez issions issons it
+     * (Note that the non-vowel itself must also be in RV.)
+     */
+    private function step2a()
+    {
+        // Longer suffixes are listed before the shorter suffixes they end with
+        // (e.g. 'irais' before 'is').
+        if (($position = $this->searchIfInRv(array(
+            'îmes', 'îtes', 'ît', 'ies', 'ie', 'iraIent', 'irais', 'irait', 'irai', 'iras', 'ira', 'irent', 'irez', 'iriez',
+            'irions', 'irons', 'iront', 'ir', 'issaIent', 'issais', 'issait', 'issant', 'issantes', 'issante', 'issants',
+            'issent', 'isses', 'issez', 'isse', 'issiez', 'issions', 'issons', 'is', 'it', 'i'))) !== false) {
+
+            // The letter in front of the suffix must be a non-vowel that is itself in RV.
+            $before = $position - 1;
+            $letter = Utf8::substr($this->word, $before, 1);
+            if ($this->inRv($before) && (!in_array($letter, self::$vowels))) {
+                $this->word = Utf8::substr($this->word, 0, $position);
+
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * Do step 2b if step 2a was done, but failed to remove a suffix.
+     * Step 2b: Other verb suffixes
+     *
+     * @return boolean True when one of the listed suffixes was found in RV, false otherwise
+     */
+    private function step2b()
+    {
+        // é ée ées és èrent er era erai eraIent erais erait eras erez eriez erions erons eront ez iez
+        // delete
+        if (($position = $this->searchIfInRv(array(
+            'ées', 'èrent', 'erais', 'erait', 'erai', 'eraIent', 'eras', 'erez', 'eriez',
+            'erions', 'erons', 'eront', 'era', 'er', 'iez', 'ez','és', 'ée', 'é'))) !== false) {
+
+            $this->word = Utf8::substr($this->word, 0, $position);
+
+            return true;
+        }
+
+        // âmes ât âtes a ai aIent ais ait ant ante antes ants as asse assent asses assiez assions
+        // delete
+        // if preceded by e, delete
+        if (($position = $this->searchIfInRv(array(
+            'âmes', 'âtes', 'ât', 'aIent', 'ais', 'ait', 'antes', 'ante', 'ants', 'ant',
+            'assent', 'asses', 'assiez', 'assions', 'asse', 'as', 'ai', 'a'))) !== false) {
+
+            $before = $position - 1;
+            $letter = Utf8::substr($this->word, $before, 1);
+            if ($this->inRv($before) && ($letter == 'e')) {
+                // Cut one letter earlier so the preceding e goes as well.
+                $this->word = Utf8::substr($this->word, 0, $before);
+
+            } else {
+                $this->word = Utf8::substr($this->word, 0, $position);
+            }
+
+            return true;
+        }
+
+        // ions
+        // delete if in R2
+        if (($position = $this->searchIfInRv(array('ions'))) !== false) {
+            if ($this->inR2($position)) {
+                $this->word = Utf8::substr($this->word, 0, $position);
+            }
+
+            return true;
+        }
+
+        return false;
+    }
+
+    /**
+     * Step 3: Replace final Y with i or final ç with c
+     */
+    private function step3()
+    {
+        $this->word = preg_replace('#(Y)$#u', 'i', $this->word);
+        $this->word = preg_replace('#(ç)$#u', 'c', $this->word);
+    }
+
+    /**
+     * Step 4: Residual suffix
+     *
+     * @return boolean True when one of the RV-confined rules (ion, ier/ière, e, guë) was applied
+     */
+    private function step4()
+    {
+        // If the word ends s, not preceded by a, i, o, u, è or s, delete it.
+        // The "u" modifier is required: the class contains the multibyte letter è, and
+        // without it the pattern is matched byte by byte instead of character by character.
+        if (preg_match('#[^aiouès]s$#u', $this->word)) {
+            $this->word = Utf8::substr($this->word, 0, -1);
+        }
+
+        // In the rest of step 4, all tests are confined to the RV region.
+        // ion
+        // delete if in R2 and preceded by s or t
+        if ((($position = $this->searchIfInRv(array('ion'))) !== false) && ($this->inR2($position))) {
+            $before = $position - 1;
+            $letter = Utf8::substr($this->word, $before, 1);
+            if ($this->inRv($before) && (($letter == 's') || ($letter == 't'))) {
+                $this->word = Utf8::substr($this->word, 0, $position);
+            }
+            return true;
+        }
+
+        // ier ière Ier Ière
+        // replace with i
+        if (($this->searchIfInRv(array('ier', 'ière', 'Ier', 'Ière'))) !== false) {
+            $this->word = preg_replace('#(ier|ière|Ier|Ière)$#u', 'i', $this->word);
+            return true;
+        }
+
+        // e
+        // delete
+        if (($this->searchIfInRv(array('e'))) !== false) {
+            $this->word = Utf8::substr($this->word, 0, -1);
+            return true;
+        }
+
+        // ë
+        // if preceded by gu, delete
+        if (($position = $this->searchIfInRv(array('guë'))) !== false) {
+            // $position points at the g; the ë itself sits two letters further on.
+            if ($this->inRv($position+2)) {
+                $this->word = Utf8::substr($this->word, 0, -1);
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * Step 5: Undouble
+     * If the word ends enn, onn, ett, ell or eill, delete the last letter
+     */
+    private function step5()
+    {
+        if ($this->search(array('enn', 'onn', 'ett', 'ell', 'eill')) !== false) {
+            $this->word = Utf8::substr($this->word, 0, -1);
+        }
+    }
+
+    /**
+     * Step 6: Un-accent
+     * If the word ends é or è followed by at least one non-vowel, remove the accent from
the e. + */ + private function step6() + { + $this->word = preg_replace('#(é|è)([^'.$this->plainVowels.']+)$#u', 'e$2', $this->word); + } + + /** + * And finally: + * Turn any remaining I, U and Y letters in the word back into lower case. + */ + private function finish() + { + $this->word = Utf8::str_replace(array('I','U','Y'), array('i', 'u', 'y'), $this->word); + } + + /** + * If the word begins with two vowels, RV is the region after the third letter, + * otherwise the region after the first vowel not at the beginning of the word, + * or the end of the word if these positions cannot be found. + * (Exceptionally, par, col or tap, at the begining of a word is also taken to define RV as the region to their right.) + */ + protected function rv() + { + $length = Utf8::strlen($this->word); + + $this->rv = ''; + $this->rvIndex = $length; + + if ($length < 3) { + return true; + } + + // If the word begins with two vowels, RV is the region after the third letter + $first = Utf8::substr($this->word, 0, 1); + $second = Utf8::substr($this->word, 1, 1); + + if ( (in_array($first, self::$vowels)) && (in_array($second, self::$vowels)) ) { + $this->rv = Utf8::substr($this->word, 3); + $this->rvIndex = 3; + return true; + } + + // (Exceptionally, par, col or tap, at the begining of a word is also taken to define RV as the region to their right.) 
+ $begin3 = Utf8::substr($this->word, 0, 3); + if (in_array($begin3, array('par', 'col', 'tap'))) { + $this->rv = Utf8::substr($this->word, 3); + $this->rvIndex = 3; + return true; + } + + // otherwise the region after the first vowel not at the beginning of the word, + for ($i=1; $i<$length; $i++) { + $letter = Utf8::substr($this->word, $i, 1); + if (in_array($letter, self::$vowels)) { + $this->rv = Utf8::substr($this->word, ($i + 1)); + $this->rvIndex = $i + 1; + return true; + } + } + + return false; + } +} diff --git a/libraries/vendor/wamania/php-stemmer/src/German.php b/libraries/vendor/wamania/php-stemmer/src/German.php new file mode 100644 index 00000000000..949344241c4 --- /dev/null +++ b/libraries/vendor/wamania/php-stemmer/src/German.php @@ -0,0 +1,213 @@ +plainVowels = implode('', self::$vowels); + + $this->word = Utf8::strtolower($word); + + // First, replace ß by ss + $this->word = Utf8::str_replace('ß', 'ss', $this->word); + + // put u and y between vowels into upper case + $this->word = preg_replace('#(['.$this->plainVowels.'])y(['.$this->plainVowels.'])#u', '$1Y$2', $this->word); + $this->word = preg_replace('#(['.$this->plainVowels.'])u(['.$this->plainVowels.'])#u', '$1U$2', $this->word); + + // R1 and R2 are first set up in the standard way + $this->r1(); + $this->r2(); + + // but then R1 is adjusted so that the region before it contains at least 3 letters. 
+ if ($this->r1Index < 3) { + $this->r1Index = 3; + $this->r1 = Utf8::substr($this->word, 3); + } + + $this->step1(); + $this->step2(); + $this->step3(); + $this->finish(); + + return $this->word; + } + + /** + * Step 1 + */ + public function step1() + { + // delete if in R1 + if ( ($position = $this->search(array('em', 'ern', 'er'))) !== false) { + if ($this->inR1($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + return true; + } + + // delete if in R1 + if ( ($position = $this->search(array('es', 'en', 'e'))) !== false) { + if ($this->inR1($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + + //If an ending of group (b) is deleted, and the ending is preceded by niss, delete the final s + if ($this->search(array('niss')) !== false) { + $this->word = Utf8::substr($this->word, 0, -1); + } + } + return true; + } + + // s (preceded by a valid s-ending) + if ( ($position = $this->search(array('s'))) !== false) { + if ($this->inR1($position)) { + $before = $position - 1; + $letter = Utf8::substr($this->word, $before, 1); + + if (in_array($letter, self::$sEndings)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + } + return true; + } + + return false; + } + + /** + * Step 2 + */ + public function step2() + { + // en er est + // delete if in R1 + if ( ($position = $this->search(array('en', 'er', 'est'))) !== false) { + if ($this->inR1($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + return true; + } + + // st (preceded by a valid st-ending, itself preceded by at least 3 letters) + // delete if in R1 + if ( ($position = $this->search(array('st'))) !== false) { + if ($this->inR1($position)) { + $before = $position - 1; + if ($before >= 3) { + $letter = Utf8::substr($this->word, $before, 1); + + if (in_array($letter, self::$stEndings)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + } + } + return true; + } + return false; + } + + /** + * Step 3: d-suffixes + */ + 
public function step3() + { + // end ung + // delete if in R2 + // if preceded by ig, delete if in R2 and not preceded by e + if ( ($position = $this->search(array('end', 'ung'))) !== false) { + if ($this->inR2($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + + if ( ($position2 = $this->search(array('ig'))) !== false) { + $before = $position2 - 1; + $letter = Utf8::substr($this->word, $before, 1); + + if ( ($this->inR2($position2)) && ($letter != 'e') ) { + $this->word = Utf8::substr($this->word, 0, $position2); + } + } + return true; + } + + // ig ik isch + // delete if in R2 and not preceded by e + if ( ($position = $this->search(array('ig', 'ik', 'isch'))) !== false) { + $before = $position - 1; + $letter = Utf8::substr($this->word, $before, 1); + + if ( ($this->inR2($position)) && ($letter != 'e') ) { + $this->word = Utf8::substr($this->word, 0, $position); + } + return true; + } + + // lich heit + // delete if in R2 + // if preceded by er or en, delete if in R1 + // (strict comparison: a suffix found at offset 0 is still a match) + if ( ($position = $this->search(array('lich', 'heit'))) !== false) { + if ($this->inR2($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + + if ( ($position2 = $this->search(array('er', 'en'))) !== false) { + if ($this->inR1($position2)) { + $this->word = Utf8::substr($this->word, 0, $position2); + } + } + return true; + } + + // keit + // delete if in R2 + // if preceded by lich or ig, delete if in R2 + // (strict comparison: a suffix found at offset 0 is still a match) + if ( ($position = $this->search(array('keit'))) !== false) { + if ($this->inR2($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + + if ( ($position2 = $this->search(array('lich', 'ig'))) !== false) { + if ($this->inR2($position2)) { + $this->word = Utf8::substr($this->word, 0, $position2); + } + } + return true; + } + + return false; + } + + /** + * Finally + */ + public function finish() + { + // turn U and Y back into lower case, and remove the umlaut accent from a, o and u.
+ $this->word = Utf8::str_replace(array('U', 'Y', 'ä', 'ü', 'ö'), array('u', 'y', 'a', 'u', 'o'), $this->word); + } +} diff --git a/libraries/vendor/wamania/php-stemmer/src/Italian.php b/libraries/vendor/wamania/php-stemmer/src/Italian.php new file mode 100644 index 00000000000..62107407b1a --- /dev/null +++ b/libraries/vendor/wamania/php-stemmer/src/Italian.php @@ -0,0 +1,286 @@ +plainVowels = implode('', self::$vowels); + + $this->word = Utf8::strtolower($word); + + // First, replace all acute accents by grave accents. + $this->word = Utf8::str_replace(array('á', 'é', 'í', 'ó', 'ú'), array('à', 'è', 'ì', 'ò', 'ù'), $this->word); + + //And, as in French, put u after q, and u, i between vowels into upper case. (See note on vowel marking.) The vowels are then + $this->word = preg_replace('#([q])u#u', '$1U', $this->word); + $this->word = preg_replace('#(['.$this->plainVowels.'])u(['.$this->plainVowels.'])#u', '$1U$2', $this->word); + $this->word = preg_replace('#(['.$this->plainVowels.'])i(['.$this->plainVowels.'])#u', '$1I$2', $this->word); + + $this->rv(); + $this->r1(); + $this->r2(); + + $this->step0(); + + $word = $this->word; + $this->step1(); + + //Do step 2 if no ending was removed by step 1. 
+ if ($word == $this->word) { + $this->step2(); + } + + $this->step3a(); + $this->step3b(); + $this->finish(); + + return $this->word; + } + + /** + * Step 0: Attached pronoun + */ + private function step0() + { + // Search for the longest among the following suffixes + if ( ($position = $this->search(array( + 'gliela', 'gliele', 'glieli', 'glielo', 'gliene', + 'sene', 'mela', 'mele', 'meli', 'melo', 'mene', 'tela', 'tele', 'teli', 'telo', 'tene', 'cela', + 'cele', 'celi', 'celo', 'cene', 'vela', 'vele', 'veli', 'velo', 'vene', + 'gli', 'la', 'le', 'li', 'lo', 'mi', 'ne', 'si', 'ti', 'vi', 'ci'))) !== false) { + + $suffixe = Utf8::substr($this->word, $position); + + // following one of (in RV) + // a + $a = array('ando', 'endo'); + $a = array_map(function($item) use ($suffixe) { + return $item . $suffixe; + }, $a); + // In case of (a) the suffix is deleted + if ($this->searchIfInRv($a) !== false) { + $this->word = Utf8::substr($this->word, 0, $position); + } + + //b + $b = array('ar', 'er', 'ir'); + $b = array_map(function($item) use ($suffixe) { + return $item . 
$suffixe; + }, $b); + // in case (b) it is replaced by e + if ($this->searchIfInRv($b) !== false) { + $this->word = preg_replace('#('.$suffixe.')$#u', 'e', $this->word); + } + + return true; + } + + return false; + } + + /** + * Step 1: Standard suffix removal + */ + private function step1() + { + // amente + // delete if in R1 + // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise, + // if preceded by os, ic or abil, delete if in R2 + if ( ($position = $this->search(array('amente'))) !== false) { + if ($this->inR1($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + + // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise, + if ( ($position2 = $this->searchIfInR2(array('iv'))) !== false) { + $this->word = Utf8::substr($this->word, 0, $position2); + if ( ($position3 = $this->searchIfInR2(array('at'))) !== false) { + $this->word = Utf8::substr($this->word, 0, $position3); + } + + // if preceded by os, ic or abil, delete if in R2 + } elseif ( ($position4 = $this->searchIfInR2(array('os', 'ic', 'abil'))) !== false) { + $this->word = Utf8::substr($this->word, 0, $position4); + } + return true; + } + + // delete if in R2 + if ( ($position = $this->search(array( + 'ibili', 'atrice', 'abili', 'abile', 'ibile', 'atrici', 'mente', + 'anza', 'anze', 'iche', 'ichi', 'ismo', 'ismi', 'ista', 'iste', 'isti', 'istà', 'istè', 'istì', 'ante', 'anti', + 'ico', 'ici', 'ica', 'ice', 'oso', 'osi', 'osa', 'ose' + ))) !== false) { + + if ($this->inR2($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + return true; + } + + // azione azioni atore atori + // delete if in R2 + // if preceded by ic, delete if in R2 + if ( ($position = $this->search(array('azione', 'azioni', 'atore', 'atori'))) !== false) { + if ($this->inR2($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + + if ( ($position2 = $this->search(array('ic'))) !== false) { + if
($this->inR2($position2)) { + $this->word = Utf8::substr($this->word, 0, $position2); + } + } + } + return true; + } + + // logia logie + // replace with log if in R2 + if ( ($position = $this->search(array('logia', 'logie'))) !== false) { + if ($this->inR2($position)) { + $this->word = preg_replace('#(logia|logie)$#u', 'log', $this->word); + } + return true; + } + + // uzione uzioni usione usioni + // replace with u if in R2 + if ( ($position = $this->search(array('uzione', 'uzioni', 'usione', 'usioni'))) !== false) { + if ($this->inR2($position)) { + $this->word = preg_replace('#(uzione|uzioni|usione|usioni)$#u', 'u', $this->word); + } + return true; + } + + // enza enze + // replace with ente if in R2 + if ( ($position = $this->search(array('enza', 'enze'))) !== false) { + if ($this->inR2($position)) { + $this->word = preg_replace('#(enza|enze)$#u', 'ente', $this->word); + } + return true; + } + + // amento amenti imento imenti + // delete if in RV + if ( ($position = $this->search(array('amento', 'amenti', 'imento', 'imenti'))) !== false) { + if ($this->inRv($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + return true; + } + + // ità + // delete if in R2 + // if preceded by abil, ic or iv, delete if in R2 + if ( ($position = $this->search(array('ità'))) !== false) { + if ($this->inR2($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + + if ( ($position2 = $this->searchIfInR2(array('abil', 'ic', 'iv'))) != false) { + $this->word = Utf8::substr($this->word, 0, $position2); + } + return true; + } + + // ivo ivi iva ive + // delete if in R2 + // if preceded by at, delete if in R2 (and if further preceded by ic, delete if in R2) + if ( ($position = $this->search(array('ivo', 'ivi', 'iva', 'ive'))) !== false) { + if ($this->inR2($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + + if ( ($position2 = $this->searchIfInR2(array('at'))) !== false) { + $this->word = Utf8::substr($this->word, 0, 
$position2); + if ( ($position3 = $this->searchIfInR2(array('ic'))) !== false) { + $this->word = Utf8::substr($this->word, 0, $position3); + } + } + return true; + } + + return false; + } + + /** + * Step 2: Verb suffixes + * Search for the longest among the following suffixes in RV, and if found, delete. + */ + private function step2() + { + if ( ($position = $this->searchIfInRv(array( + 'assimo', 'assero', 'eranno', 'erebbero', 'erebbe', 'eremmo', 'ereste', 'eresti', 'essero', 'iranno', 'irebbero', 'irebbe', 'iremmo', + 'iscano', 'ireste', 'iresti', 'iscono', 'issero', + 'avamo', 'arono', 'avano', 'avate', 'eremo', 'erete', 'erono', 'evamo', 'evano', 'evate', 'ivamo', 'ivano', 'ivate', 'iremo', 'irete', 'irono', + 'ammo', 'ando', 'asse', 'assi', 'emmo', 'enda', 'ende', 'endi', 'endo', 'erai', 'erei', 'Yamo', 'iamo', 'immo', 'irà', 'irai', 'irei', + 'isca', 'isce', 'isci', 'isco', + 'ano', 'are', 'ata', 'ate', 'ati', 'ato', 'ava', 'avi', 'avo', 'erà', 'ere', 'erò', 'ete', 'eva', + 'evi', 'evo', 'ire', 'ita', 'ite', 'iti', 'ito', 'iva', 'ivi', 'ivo', 'ono', 'uta', 'ute', 'uti', 'uto', 'irò', 'ar', 'ir'))) !== false) { + + $this->word = Utf8::substr($this->word, 0, $position); + } + } + + /** + * Step 3a + * Delete a final a, e, i, o, à, è, ì or ò if it is in RV, and a preceding i if it is in RV + */ + private function step3a() + { + if ($this->searchIfInRv(array('a', 'e', 'i', 'o', 'à', 'è', 'ì', 'ò')) !== false) { + $this->word = Utf8::substr($this->word, 0, -1); + + if ($this->searchIfInRv(array('i')) !== false) { + $this->word = Utf8::substr($this->word, 0, -1); + } + return true; + } + return false; + } + + /** + * Step 3b + * Replace final ch (or gh) with c (or g) if in RV (crocch -> crocc) + */ + private function step3b() + { + if ($this->searchIfInRv(array('ch')) !== false) { + $this->word = preg_replace('#(ch)$#u', 'c', $this->word); + + } elseif ($this->searchIfInRv(array('gh')) !== false) { + $this->word = preg_replace('#(gh)$#u', 'g', $this->word); + } + 
} + + /** + * Finally + * turn I and U back into lower case + */ + private function finish() + { + $this->word = Utf8::str_replace(array('I', 'U'), array('i', 'u'), $this->word); + } +} diff --git a/libraries/vendor/wamania/php-stemmer/src/Norwegian.php b/libraries/vendor/wamania/php-stemmer/src/Norwegian.php new file mode 100644 index 00000000000..bf9d2322f9d --- /dev/null +++ b/libraries/vendor/wamania/php-stemmer/src/Norwegian.php @@ -0,0 +1,127 @@ +word = Utf8::strtolower($word); + + // R2 is not used: R1 is defined in the same way as in the German stemmer + $this->r1(); + + // then R1 is adjusted so that the region before it contains at least 3 letters. + if ($this->r1Index < 3) { + $this->r1Index = 3; + $this->r1 = Utf8::substr($this->word, 3); + } + + // Do each of steps 1, 2 and 3. + $this->step1(); + $this->step2(); + $this->step3(); + + return $this->word; + } + + /** + * Define a valid s-ending as one of + * b c d f g h j l m n o p r t v y z, + * or k not preceded by a vowel + * + * @param string $word word whose last letter (and, for k, the letter before it) is tested + * @return boolean + */ + private function hasValidSEnding($word) + { + $lastLetter = Utf8::substr($word, -1, 1); + if (in_array($lastLetter, array('b', 'c', 'd', 'f', 'g', 'h', 'j', 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y', 'z'))) { + return true; + } + if ($lastLetter == 'k') { + $beforeLetter = Utf8::substr($word, -2, 1); + if (!in_array($beforeLetter, self::$vowels)) { + return true; + } + } + return false; + } + + /** + * Step 1 + * Search for the longest among the following suffixes in R1, and perform the action indicated.
+ */ + private function step1() + { + // erte ert + // replace with er + if ( ($position = $this->searchIfInR1(array('erte', 'ert'))) !== false) { + $this->word = preg_replace('#(erte|ert)$#u', 'er', $this->word); + return true; + } + + // a e ede ande ende ane ene hetene en heten ar er heter as es edes endes enes hetenes ens hetens ers ets et het ast + // delete + if ( ($position = $this->searchIfInR1(array( + 'hetenes', 'hetene', 'hetens', 'heten', 'endes', 'heter', 'ande', 'ende', 'enes', 'edes', 'ede', 'ane', + 'ene', 'het', 'ers', 'ets', 'ast', 'ens', 'en', 'ar', 'er', 'as', 'es', 'et', 'a', 'e' + ))) !== false) { + $this->word = Utf8::substr($this->word, 0, $position); + return true; + } + + // s + // delete if preceded by a valid s-ending + if ( ($position = $this->searchIfInR1(array('s'))) !== false) { + $word = Utf8::substr($this->word, 0, $position); + if ($this->hasValidSEnding($word)) { + $this->word = $word; + } + return true; + } + } + + /** + * Step 2 + * If the word ends dt or vt in R1, delete the t. + */ + private function step2() + { + if ($this->searchIfInR1(array('dt', 'vt')) !== false) { + $this->word = Utf8::substr($this->word, 0, -1); + } + } + + /** + * Step 3: + * Search for the longest among the following suffixes in R1, and if found, delete. 
+ */ + private function step3() + { + // leg eleg ig eig lig elig els lov elov slov hetslov + if ( ($position = $this->searchIfInR1(array( + 'hetslov', 'eleg', 'elov', 'slov', 'elig', 'eig', 'lig', 'els', 'lov', 'leg', 'ig' + ))) !== false) { + $this->word = Utf8::substr($this->word, 0, $position); + } + } +} diff --git a/libraries/vendor/wamania/php-stemmer/src/Portuguese.php b/libraries/vendor/wamania/php-stemmer/src/Portuguese.php new file mode 100644 index 00000000000..7621b83a34a --- /dev/null +++ b/libraries/vendor/wamania/php-stemmer/src/Portuguese.php @@ -0,0 +1,280 @@ +word = Utf8::strtolower($word); + + $this->word = Utf8::str_replace(array('ã', 'õ'), array('a~', 'o~'), $this->word); + + $this->rv(); + $this->r1(); + $this->r2(); + + $word = $this->word; + $this->step1(); + + if ($word == $this->word) { + $this->step2(); + } + + if ($word != $this->word) { + $this->step3(); + } else { + $this->step4(); + } + + $this->step5(); + $this->finish(); + + return $this->word; + } + + /** + * Step 1: Standard suffix removal + */ + public function step1() + { + // delete if in R2 + if ( ($position = $this->search(array( + 'amentos', 'imentos', 'adoras', 'adores', 'amento', 'imento', 'adora', 'istas', 'ismos', 'antes', 'ância', + 'ezas', 'eza', 'icos', 'icas', 'ismo', 'ável', 'ível', 'ista', 'oso', + 'osos', 'osas', 'osa', 'ico', 'ica', 'ador', 'aça~o', 'aço~es' , 'ante'))) !== false) { + + if ($this->inR2($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + return true; + } + + // logía logías + // replace with log if in R2 + if ( ($position = $this->search(array('logías', 'logía'))) !== false) { + if ($this->inR2($position)) { + $this->word = preg_replace('#(logías|logía)$#u', 'log', $this->word); + } + return true; + } + + // ución uciones + // replace with u if in R2 + if ( ($position = $this->search(array('uciones', 'ución'))) !== false) { + if ($this->inR2($position)) { + $this->word = preg_replace('#(uciones|ución)$#u', 'u', 
$this->word); + } + return true; + } + + // ência ências + // replace with ente if in R2 + if ( ($position = $this->search(array('ências', 'ência'))) !== false) { + if ($this->inR2($position)) { + $this->word = preg_replace('#(ências|ência)$#u', 'ente', $this->word); + } + return true; + } + + // amente + // delete if in R1 + // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise, + // if preceded by os, ic or ad, delete if in R2 + if ( ($position = $this->search(array('amente'))) !== false) { + + // delete if in R1 + if ($this->inR1($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + + // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise, + if ( ($position2 = $this->searchIfInR2(array('iv'))) !== false) { + $this->word = Utf8::substr($this->word, 0, $position2); + if ( ($position3 = $this->searchIfInR2(array('at'))) !== false) { + $this->word = Utf8::substr($this->word, 0, $position3); + } + + // if preceded by os, ic or ad, delete if in R2 + } elseif ( ($position4 = $this->searchIfInR2(array('os', 'ic', 'ad'))) !== false) { + $this->word = Utf8::substr($this->word, 0, $position4); + } + return true; + } + + // mente + // delete if in R2 + // if preceded by ante, avel or ível, delete if in R2 + if ( ($position = $this->search(array('mente'))) !== false) { + + // delete if in R2 + if ($this->inR2($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + + // if preceded by ante, avel or ível, delete if in R2 + if ( ($position2 = $this->searchIfInR2(array('ante', 'avel', 'ível'))) != false) { + $this->word = Utf8::substr($this->word, 0, $position2); + } + return true; + } + + // idade idades + // delete if in R2 + // if preceded by abil, ic or iv, delete if in R2 + if ( ($position = $this->search(array('idades', 'idade'))) !== false) { + + // delete if in R2 + if ($this->inR2($position)) { + $this->word = Utf8::substr($this->word, 0, 
$position); + } + + // if preceded by abil, ic or iv, delete if in R2 + if ( ($position2 = $this->searchIfInR2(array('abil', 'ic', 'iv'))) !== false) { + $this->word = Utf8::substr($this->word, 0, $position2); + } + return true; + } + + // iva ivo ivas ivos + // delete if in R2 + // if preceded by at, delete if in R2 + if ( ($position = $this->search(array('ivas', 'ivos', 'iva', 'ivo'))) !== false) { + + // delete if in R2 + if ($this->inR2($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + + // if preceded by at, delete if in R2 + if ( ($position2 = $this->searchIfInR2(array('at'))) !== false) { + $this->word = Utf8::substr($this->word, 0, $position2); + } + return true; + } + + // ira iras + // replace with ir if in RV and preceded by e + if ( ($position = $this->search(array('iras', 'ira'))) !== false) { + + if ($this->inRv($position)) { + $before = $position -1; + $letter = Utf8::substr($this->word, $before, 1); + + if ($letter == 'e') { + $this->word = preg_replace('#(iras|ira)$#u', 'ir', $this->word); + } + } + return true; + } + + return false; + } + + /** + * Step 2: Verb suffixes + * Search for the longest among the following suffixes in RV, and if found, delete. 
+ */ + public function step2() + { + if ( ($position = $this->searchIfInRv(array( + 'aríamos', 'eríamos', 'iríamos', 'ássemos', 'êssemos', 'íssemos', + 'aríeis', 'eríeis', 'iríeis', 'ásseis', 'ésseis', 'ísseis', 'áramos', 'éramos', 'íramos', 'ávamos', + 'aremos', 'eremos', 'iremos', + 'ariam', 'eriam', 'iriam', 'assem', 'essem', 'issem', 'arias', 'erias', 'irias', 'ardes', 'erdes', 'irdes', + 'asses', 'esses', 'isses', 'astes', 'estes', 'istes', 'áreis', 'areis', 'éreis', 'ereis', 'íreis', 'ireis', + 'áveis', 'íamos', 'armos', 'ermos', 'irmos', + 'aria', 'eria', 'iria', 'asse', 'esse', 'isse', 'aste', 'este', 'iste', 'arei', 'erei', 'irei', 'adas', 'idas', + 'aram', 'eram', 'iram', 'avam', 'arem', 'erem', 'irem', 'ando', 'endo', 'indo', 'ara~o', 'era~o', 'ira~o', + 'arás', 'aras', 'erás', 'eras', 'irás', 'avas', 'ares', 'eres', 'ires', 'íeis', 'ados', 'idos', 'ámos', 'amos', + 'emos', 'imos', 'iras', + 'ada', 'ida', 'ará', 'ara', 'erá', 'era', 'irá', 'ava', 'iam', 'ado', 'ido', 'ias', 'ais', 'eis', 'ira', + 'ia', 'ei', 'am', 'em', 'ar', 'er', 'ir', 'as', 'es', 'is', 'eu', 'iu', 'ou', + ))) !== false) { + + $this->word = Utf8::substr($this->word, 0, $position); + return true; + } + return false; + } + + /** + * Step 3: d-suffixes + * + */ + public function step3() + { + // Delete suffix i if in RV and preceded by c + if ($this->searchIfInRv(array('i')) !== false) { + $letter = Utf8::substr($this->word, -2, 1); + + if ($letter == 'c') { + $this->word = Utf8::substr($this->word, 0, -1); + } + return true; + } + return false; + } + + /** + * Step 4 + */ + public function step4() + { + // If the word ends with one of the suffixes "os a i o á í ó" in RV, delete it + if ( ($position = $this->searchIfInRv(array('os', 'a', 'i', 'o','á', 'í', 'ó'))) !== false) { + $this->word = Utf8::substr($this->word, 0, $position); + return true; + } + return false; + } + + /** + * Step 5 + */ + public function step5() + { + // If the word ends with one of "e é ê" in RV, delete it, and if 
preceded by gu (or ci) with the u (or i) in RV, delete the u (or i). + if ($this->searchIfInRv(array('e', 'é', 'ê')) !== false) { + $this->word = Utf8::substr($this->word, 0, -1); + + if ( ($position2 = $this->search(array('gu', 'ci'))) !== false) { + if ($this->inRv(($position2+1))) { + $this->word = Utf8::substr($this->word, 0, -1); + } + } + return true; + } else if ($this->search(array('ç')) !== false) { + $this->word = preg_replace('#(ç)$#u', 'c', $this->word); + return true; + } + return false; + } + + /** + * Finally + */ + public function finish() + { + // turn a~ and o~ back into ã and õ + $this->word = Utf8::str_replace(array('a~', 'o~'), array('ã', 'õ'), $this->word); + } +} diff --git a/libraries/vendor/wamania/php-stemmer/src/Romanian.php b/libraries/vendor/wamania/php-stemmer/src/Romanian.php new file mode 100644 index 00000000000..b831ba437dd --- /dev/null +++ b/libraries/vendor/wamania/php-stemmer/src/Romanian.php @@ -0,0 +1,331 @@ +word = Utf8::strtolower($word); + + $this->plainVowels = implode('', self::$vowels); + + // First, i and u between vowels are put into upper case (so that they are treated as consonants). + $this->word = preg_replace('#(['.$this->plainVowels.'])u(['.$this->plainVowels.'])#u', '$1U$2', $this->word); + $this->word = preg_replace('#(['.$this->plainVowels.'])i(['.$this->plainVowels.'])#u', '$1I$2', $this->word); + + $this->rv(); + $this->r1(); + $this->r2(); + + $this->step0(); + + $word1 = $this->word; + $word2 = $this->word; + + do { + $word1 = $this->word; + $this->step1(); + } while ($this->word != $word1); + + $this->step2(); + + // Do step 3 if no suffix was removed either by step 1 or step 2.
+ if ($word2 == $this->word) { + $this->step3(); + } + + $this->step4(); + $this->finish(); + + return $this->word; + } + + /** + * Step 0: Removal of plurals (and other simplifications) + * Search for the longest among the following suffixes, and, if it is in R1, perform the action indicated. + * @return boolean + */ + public function step0() + { + // ul ului + // delete + if ( ($position = $this->search(array('ul', 'ului'))) !== false) { + if ($this->inR1($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + return true; + } + + // aua + // replace with a + if ( ($position = $this->search(array('aua'))) !== false) { + if ($this->inR1($position)) { + $this->word = preg_replace('#(aua)$#u', 'a', $this->word); + } + return true; + } + + // ea ele elor + // replace with e + if ( ($position = $this->search(array('ea', 'ele', 'elor'))) !== false) { + if ($this->inR1($position)) { + $this->word = preg_replace('#(ea|ele|elor)$#u', 'e', $this->word); + } + return true; + } + + // ii iua iei iile iilor ilor + // replace with i + if ( ($position = $this->search(array('ii', 'iua', 'iei', 'iile', 'iilor', 'ilor'))) !== false) { + if ($this->inR1($position)) { + $this->word = preg_replace('#(ii|iua|iei|iile|iilor|ilor)$#u', 'i', $this->word); + } + return true; + } + + // ile + // replace with i if not preceded by ab + if ( ($position = $this->search(array('ile'))) !== false) { + if ($this->inR1($position)) { + $before = Utf8::substr($this->word, ($position-2), 2); + + if ($before != 'ab') { + $this->word = preg_replace('#(ile)$#u', 'i', $this->word); + } + } + return true; + } + + // atei + // replace with at + if ( ($position = $this->search(array('atei'))) != false) { + if ($this->inR1($position)) { + $this->word = preg_replace('#(atei)$#u', 'at', $this->word); + } + return true; + } + + // aţie aţia + // replace with aţi + if ( ($position = $this->search(array('aţie', 'aţia'))) !== false) { + if ($this->inR1($position)) { + $this->word = 
preg_replace('#(aţie|aţia)$#u', 'aţi', $this->word); + } + return true; + } + + return false; + } + + /** + * Step 1: Reduction of combining suffixes + * Search for the longest among the following suffixes, and, if it is in R1, perform the replacement action indicated. + * Then repeat this step until no replacement occurs. + * @return boolean + */ + public function step1() + { + // abilitate abilitati abilităi abilităţi + // replace with abil + if ( ($position = $this->search(array('abilitate', 'abilitati', 'abilităi', 'abilităţi'))) !== false) { + if ($this->inR1($position)) { + $this->word = preg_replace('#(abilitate|abilitati|abilităi|abilităţi)$#u', 'abil', $this->word); + } + return true; + } + + // ibilitate + // replace with ibil + if ( ($position = $this->search(array('ibilitate'))) !== false) { + if ($this->inR1($position)) { + $this->word = preg_replace('#(ibilitate)$#u', 'ibil', $this->word); + } + return true; + } + + // ivitate ivitati ivităi ivităţi + // replace with iv + if ( ($position = $this->search(array('ivitate', 'ivitati', 'ivităi', 'ivităţi'))) !== false) { + if ($this->inR1($position)) { + $this->word = preg_replace('#(ivitate|ivitati|ivităi|ivităţi)$#u', 'iv', $this->word); + } + return true; + } + + // icitate icitati icităi icităţi icator icatori iciv iciva icive icivi icivă ical icala icale icali icală + // replace with ic + // (the pattern must list the same suffixes as the search above; with "cator" in place of "icator" the leading i survived the replacement) + if ( ($position = $this->search(array( + 'icitate', 'icitati', 'icităi', 'icităţi', 'icatori', 'icator', 'iciva', + 'icive', 'icivi', 'icivă', 'icala', 'icale', 'icali', 'icală', 'iciv', 'ical'))) !== false) { + if ($this->inR1($position)) { + $this->word = preg_replace('#(icitate|icitati|icităi|icităţi|icatori|icator|iciva|icive|icivi|icivă|icala|icale|icali|icală|ical|iciv)$#u', 'ic', $this->word); + } + return true; + } + + // ativ ativa ative ativi ativă aţiune atoare ator atori ătoare ător ători + // replace with at + if ( ($position = $this->search(array('ativa', 'ative', 'ativi', 'ativă', 'ativ', 'aţiune', 
'atoare', 'atori', 'ătoare', 'ători', 'ător', 'ator'))) !== false) { + if ($this->inR1($position)) { + $this->word = preg_replace('#(ativa|ative|ativi|ativă|ativ|aţiune|atoare|atori|ătoare|ători|ător|ator)$#u', 'at', $this->word); + } + return true; + } + + // itiv itiva itive itivi itivă iţiune itoare itor itori + // replace with it + if ( ($position = $this->search(array('itiva', 'itive', 'itivi', 'itivă', 'itiv', 'iţiune', 'itoare', 'itori', 'itor'))) !== false) { + if ($this->inR1($position)) { + $this->word = preg_replace('#(itiva|itive|itivi|itivă|itiv|iţiune|itoare|itori|itor)$#u', 'it', $this->word); + } + return true; + } + + return false; + } + + /** + * Step 2: Removal of 'standard' suffixes + * Search for the longest among the following suffixes, and, if it is in R2, perform the action indicated. + * @return boolean + */ + public function step2() + { + // atori itate itati, ităţi, abila abile abili abilă, ibila ibile ibili ibilă + // anta, ante, anti, antă, ator, ibil, oasa oasă oase, ităi, abil + // osi oşi ant ici ică iva ive ivi ivă ata ată ati ate, ata ată ati ate uta ută uti ute, ita ită iti ite ica ice + // at, os, iv, ut, it, ic + // delete + if ( ($position = $this->search(array( + 'atori', 'itate', 'itati', 'ităţi', 'abila', 'abile', 'abili', 'abilă', 'ibila', 'ibile', 'ibili', 'ibilă', + 'anta', 'ante', 'anti', 'antă', 'ator', 'ibil', 'oasa', 'oasă', 'oase', 'ităi', 'abil', + 'osi', 'oşi', 'ant', 'ici', 'ică', 'iva', 'ive', 'ivi', 'ivă', 'ata', 'ată', 'ati', 'ate', 'ata', 'ată', + 'ati', 'ate', 'uta', 'ută', 'uti', 'ute', 'ita', 'ită', 'iti', 'ite', 'ica', 'ice', + 'at', 'os', 'iv', 'ut', 'it', 'ic' + ))) !== false) { + if ($this->inR2($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + return true; + } + + // iune iuni + // delete if preceded by ţ, and replace the ţ by t. 
+ if ( ($position = $this->search(array('iune', 'iuni'))) !== false) { + if ($this->inR2($position)) { + $before = $position - 1; + $letter = Utf8::substr($this->word, $before, 1); + if ($letter == 'ţ') { + $this->word = Utf8::substr($this->word, 0, $position); + $this->word = preg_replace('#(ţ)$#u', 't', $this->word); + } + } + return true; + } + + // ism isme ist ista iste isti istă işti + // replace with ist + if ( ($position = $this->search(array('isme', 'ism', 'ista', 'iste', 'isti', 'istă', 'işti', 'ist'))) !== false) { + if ($this->inR2($position)) { + $this->word = preg_replace('#(isme|ism|ista|iste|isti|istă|işti|ist)$#u', 'ist', $this->word); + } + return true; + } + + return false; + } + + /** + * Step 3: Removal of verb suffixes + * Do step 3 if no suffix was removed either by step 1 or step 2. + * @return boolean + */ + public function step3() + { + // are ere ire âre ind ând indu ându eze ească ez ezi ează esc eşti + // eşte ăsc ăşti ăşte am ai au eam eai ea eaţi eau iam iai ia iaţi + // iau ui aşi arăm arăţi ară uşi urăm urăţi ură işi irăm irăţi iră âi + // âşi ârăm ârăţi âră asem aseşi ase aserăm aserăţi aseră isem iseşi ise + // iserăm iserăţi iseră âsem âseşi âse âserăm âserăţi âseră usem useşi use userăm userăţi useră + // delete if preceded in RV by a consonant or u + if ( ($position = $this->searchIfInRv(array( + 'userăţi', 'iserăţi', 'âserăţi', 'aserăţi', + 'userăm', 'iserăm', 'âserăm', 'aserăm', + 'iseră', 'âseşi', 'useră', 'âseră', 'useşi', 'iseşi', 'aseră', 'aseşi', 'ârăţi', 'irăţi', 'urăţi', 'arăţi', 'ească', + 'usem', 'âsem', 'isem', 'asem', 'ârăm', 'urăm', 'irăm', 'arăm', 'iaţi', 'eaţi', 'ăşte', 'ăşti', 'eşte', 'eşti', 'ează', 'ându', 'indu', + 'âse', 'use', 'ise', 'ase', 'âră', 'iră', 'işi', 'ură', 'uşi', 'ară', 'aşi', 'âşi', 'iau', 'iai', 'iam', 'eau', 'eai', 'eam', 'ăsc', + 'are', 'ere', 'ire', 'âre', 'ind', 'ând', 'eze', 'ezi', 'esc', + 'âi', 'ui', 'ia', 'ea', 'au', 'ai', 'am', 'ez' + ))) !== false) { + if ($this->inRv($position)) { 
+ $before = $position - 1; + if ($this->inRv($before)) { + $letter = Utf8::substr($this->word, $before, 1); + + if ( (!in_array($letter, self::$vowels)) || ($letter == 'u') ) { + $this->word = Utf8::substr($this->word, 0, $position); + } + } + } + return true; + } + + + + // ăm aţi em eţi im iţi âm âţi seşi serăm serăţi seră sei se sesem seseşi sese seserăm seserăţi seseră + // delete + if ( ($position = $this->searchIfInRv(array( + 'seserăm', 'seserăţi', 'seseră', 'seseşi', 'sesem', 'serăţi', 'serăm', 'seşi', 'sese', 'seră', + 'aţi', 'eţi', 'iţi', 'âţi', 'sei', 'se', 'ăm', 'âm', 'em', 'im' + ))) !== false) { + if ($this->inRv($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + return true; + } + + // no verb suffix found: return an explicit boolean like the other steps + return false; + } + + /** + * Step 4: Removal of final vowel + */ + public function step4() + { + // Search for the longest among the suffixes "a e i ie ă " and, if it is in RV, delete it. + if ( ($position = $this->search(array('a', 'ie', 'e', 'i', 'ă'))) !== false) { + if ($this->inRv($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + } + + return true; + } + + /** + * Finally + * Turn I, U back into i, u + */ + public function finish() + { + // Turn I, U back into i, u + $this->word = Utf8::str_replace(array('I', 'U'), array('i', 'u'), $this->word); + } +} diff --git a/libraries/vendor/wamania/php-stemmer/src/Russian.php b/libraries/vendor/wamania/php-stemmer/src/Russian.php new file mode 100644 index 00000000000..61d10afb354 --- /dev/null +++ b/libraries/vendor/wamania/php-stemmer/src/Russian.php @@ -0,0 +1,249 @@ +word = Utf8::strtolower($word); + + // Set up the R1, R2 and RV regions + $this->r1(); + $this->r2(); + $this->rv(); + + // Do each of steps 1, 2, 3 and 4. + $this->step1(); + $this->step2(); + $this->step3(); + $this->step4(); + + return $this->word; + } + + /** + * Step 1: Search for a PERFECTIVE GERUND ending. If one is found remove it, and that is then the end of step 1.
+ * Otherwise try and remove a REFLEXIVE ending, and then search in turn for (1) an ADJECTIVAL, (2) a VERB or (3) a NOUN ending. + * As soon as one of the endings (1) to (3) is found remove it, and terminate step 1. + */ + public function step1() + { + // Search for a PERFECTIVE GERUND ending. + // group 1 + if ( ($position = $this->searchIfInRv(self::$perfectiveGerund[0])) !== false) { + if ( ($this->inRv($position)) && ($this->checkGroup1($position)) ) { + $this->word = Utf8::substr($this->word, 0, $position); + return true; + } + } + + // group 2 + if ( ($position = $this->searchIfInRv(self::$perfectiveGerund[1])) !== false) { + if ($this->inRv($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + return true; + } + } + + // Otherwise try and remove a REFLEXIVE ending + if ( ($position = $this->searchIfInRv(self::$reflexive)) !== false) { + if ($this->inRv($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + } + + // then search in turn for (1) an ADJECTIVAL, (2) a VERB or (3) a NOUN ending. + // As soon as one of the endings (1) to (3) is found remove it, and terminate step 1. 
+ if ( ($position = $this->searchIfInRv(self::$adjective)) !== false) { + if ($this->inRv($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + + if ( ($position2 = $this->search(self::$participle[0])) !== false) { + if ( ($this->inRv($position2)) && ($this->checkGroup1($position2)) ) { + $this->word = Utf8::substr($this->word, 0, $position2); + return true; + } + } + + if ( ($position2 = $this->search(self::$participle[1])) !== false) { + if ($this->inRv($position2)) { + $this->word = Utf8::substr($this->word, 0, $position2); + return true; + } + } + + return true; + } + } + + if ( ($position = $this->searchIfInRv(self::$verb[0])) !== false) { + if ( ($this->inRv($position)) && ($this->checkGroup1($position)) ) { + $this->word = Utf8::substr($this->word, 0, $position); + return true; + } + } + + if ( ($position = $this->searchIfInRv(self::$verb[1])) !== false) { + if ($this->inRv($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + return true; + } + } + + if ( ($position = $this->searchIfInRv(self::$noun)) !== false) { + if ($this->inRv($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + return true; + } + } + + return false; + } + + /** + * Step 2: If the word ends with и (i), remove it. + */ + public function step2() + { + if ( ($position = $this->searchIfInRv(array('и'))) !== false) { + if ($this->inRv($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + return true; + } + } + return false; + } + + /** + * Step 3: Search for a DERIVATIONAL ending in R2 (i.e. the entire ending must lie in R2), + * and if one is found, remove it. 
+ */ + public function step3() + { + if ( ($position = $this->searchIfInRv(self::$derivational)) !== false) { + if ($this->inR2($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + return true; + } + } + } + + /** + * Step 4: (1) Undouble н (n), or, (2) if the word ends with a SUPERLATIVE ending, remove it + * and undouble н (n), or (3) if the word ends ь (') (soft sign) remove it. + */ + public function step4() + { + // (2) if the word ends with a SUPERLATIVE ending, remove it + if ( ($position = $this->searchIfInRv(self::$superlative)) !== false) { + $this->word = Utf8::substr($this->word, 0, $position); + } + + // (1) Undouble н (n) + if ( ($position = $this->searchIfInRv(array('нн'))) !== false) { + $this->word = Utf8::substr($this->word, 0, ($position+1)); + return true; + } + + // (3) if the word ends ь (') (soft sign) remove it + if ( ($position = $this->searchIfInRv(array('ь'))) !== false) { + $this->word = Utf8::substr($this->word, 0, $position); + return true; + } + } + + /** + * In any word, RV is the region after the first vowel, or the end of the word if it contains no vowel. + */ + protected function rv() + { + $length = Utf8::strlen($this->word); + + $this->rv = ''; + $this->rvIndex = $length; + + for ($i=0; $i<$length; $i++) { + $letter = Utf8::substr($this->word, $i, 1); + if (in_array($letter, self::$vowels)) { + $this->rv = Utf8::substr($this->word, ($i+1)); + $this->rvIndex = $i + 1; + return true; + } + } + + return false; + } + + /** + * group 1 endings must follow а (a) or я (ia) + * + * @param integer $position + * @return boolean + */ + private function checkGroup1($position) + { + if (! 
$this->inRv(($position-1))) { + return false; + } + + $letter = Utf8::substr($this->word, ($position - 1), 1); + + if ($letter == 'а' || $letter == 'я') { + return true; + } + return false; + } +} diff --git a/libraries/vendor/wamania/php-stemmer/src/Spanish.php b/libraries/vendor/wamania/php-stemmer/src/Spanish.php new file mode 100644 index 00000000000..c3563f85335 --- /dev/null +++ b/libraries/vendor/wamania/php-stemmer/src/Spanish.php @@ -0,0 +1,345 @@ +word = Utf8::strtolower($word); + + $this->rv(); + $this->r1(); + $this->r2(); + + $this->step0(); + + $word = $this->word; + $this->step1(); + + // Do step 2a if no ending was removed by step 1. + if ($this->word == $word) { + $this->step2a(); + + // Do Step 2b if step 2a was done, but failed to remove a suffix. + if ($this->word == $word) { + $this->step2b(); + } + } + + $this->step3(); + $this->finish(); + + return $this->word; + } + + /** + * Step 0: Attached pronoun + * + * Search for the longest among the following suffixes + * me se sela selo selas selos la le lo las les los nos + * + * and delete it, if comes after one of + * (a) iéndo ándo ár ér ír + * (b) ando iendo ar er ir + * (c) yendo following u + * + * in RV. In the case of (c), yendo must lie in RV, but the preceding u can be outside it. + * In the case of (a), deletion is followed by removing the acute accent (for example, haciéndola -> haciendo). + */ + private function step0() + { + if ( ($position = $this->searchIfInRv(array('selas', 'selos', 'las', 'los', 'les', 'nos', 'selo', 'sela', 'me', 'se', 'la', 'le', 'lo' ))) != false) { + $suffixe = Utf8::substr($this->word, $position); + + // a + $a = array('iéndo', 'ándo', 'ár', 'ér', 'ír'); + $a = array_map(function($item) use ($suffixe) { + return $item . 
$suffixe; + }, $a); + + if ( ($position2 = $this->searchIfInRv($a)) !== false) { + $suffixe2 = Utf8::substr($this->word, $position2); + $suffixe2 = Utf8::deaccent($suffixe2, -1); + $this->word = Utf8::substr($this->word, 0, $position2); + $this->word .= $suffixe2; + $this->word = Utf8::substr($this->word, 0, $position); + return true; + } + + // b + $b = array('iendo', 'ando', 'ar', 'er', 'ir'); + $b = array_map(function($item) use ($suffixe) { + return $item . $suffixe; + }, $b); + + if ( ($position2 = $this->searchIfInRv($b)) !== false) { + $this->word = Utf8::substr($this->word, 0, $position); + return true; + } + + // c + if ( ($position2 = $this->searchIfInRv(array('yendo' . $suffixe))) != false) { + $before = Utf8::substr($this->word, ($position2-1), 1); + if ( (isset($before)) && ($before == 'u') ) { + $this->word = Utf8::substr($this->word, 0, $position); + return true; + } + } + } + + return false; + } + + /** + * Step 1 + */ + private function step1() + { + // anza anzas ico ica icos icas ismo ismos able ables ible ibles ista + // istas oso osa osos osas amiento amientos imiento imientos + // delete if in R2 + if ( ($position = $this->search(array( + 'imientos', 'imiento', 'amientos', 'amiento', 'osas', 'osos', 'osa', 'oso', 'istas', 'ista', 'ibles', + 'ible', 'ables', 'able', 'ismos', 'ismo', 'icas', 'icos', 'ica', 'ico', 'anzas', 'anza'))) != false) { + + if ($this->inR2($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + return true; + } + + // adora ador ación adoras adores aciones ante antes ancia ancias + // delete if in R2 + // if preceded by ic, delete if in R2 + if ( ($position = $this->search(array( + 'adoras', 'adora', 'aciones', 'ación', 'adores', 'ador', 'antes', 'ante', 'ancias', 'ancia'))) != false) { + + if ($this->inR2($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + + if ( ($position2 = $this->searchIfInR2(array('ic')))) { + $this->word = Utf8::substr($this->word, 0, $position2); + } + 
return true; + } + + // logía logías + // replace with log if in R2 + if ( ($position = $this->search(array('logías', 'logía'))) != false) { + if ($this->inR2($position)) { + $this->word = preg_replace('#(logías|logía)$#u', 'log', $this->word); + } + return true; + } + + // ución uciones + // replace with u if in R2 + if ( ($position = $this->search(array('uciones', 'ución'))) != false) { + if ($this->inR2($position)) { + $this->word = preg_replace('#(uciones|ución)$#u', 'u', $this->word); + } + return true; + } + + // encia encias + // replace with ente if in R2 + if ( ($position = $this->search(array('encias', 'encia'))) != false) { + if ($this->inR2($position)) { + $this->word = preg_replace('#(encias|encia)$#u', 'ente', $this->word); + } + return true; + } + + // amente + // delete if in R1 + // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise, + // if preceded by os, ic or ad, delete if in R2 + if ( ($position = $this->search(array('amente'))) != false) { + + // delete if in R1 + if ($this->inR1($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + + // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise, + if ( ($position2 = $this->searchIfInR2(array('iv'))) !== false) { + $this->word = Utf8::substr($this->word, 0, $position2); + if ( ($position3 = $this->searchIfInR2(array('at'))) !== false) { + $this->word = Utf8::substr($this->word, 0, $position3); + } + + // if preceded by os, ic or ad, delete if in R2 + } elseif ( ($position4 = $this->searchIfInR2(array('os', 'ic', 'ad'))) != false) { + $this->word = Utf8::substr($this->word, 0, $position4); + } + return true; + } + + // mente + // delete if in R2 + // if preceded by ante, able or ible, delete if in R2 + if ( ($position = $this->search(array('mente'))) != false) { + + // delete if in R2 + if ($this->inR2($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + + // if 
preceded by ante, able or ible, delete if in R2 + if ( ($position2 = $this->searchIfInR2(array('ante', 'able', 'ible'))) != false) { + $this->word = Utf8::substr($this->word, 0, $position2); + } + return true; + } + + // idad idades + // delete if in R2 + // if preceded by abil, ic or iv, delete if in R2 + if ( ($position = $this->search(array('idades', 'idad'))) != false) { + + // delete if in R2 + if ($this->inR2($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + + // if preceded by abil, ic or iv, delete if in R2 + if ( ($position2 = $this->searchIfInR2(array('abil', 'ic', 'iv'))) != false) { + $this->word = Utf8::substr($this->word, 0, $position2); + } + return true; + } + + // iva ivo ivas ivos + // delete if in R2 + // if preceded by at, delete if in R2 + if ( ($position = $this->search(array('ivas', 'ivos', 'iva', 'ivo'))) != false) { + + // delete if in R2 + if ($this->inR2($position)) { + $this->word = Utf8::substr($this->word, 0, $position); + } + + // if preceded by at, delete if in R2 + if ( ($position2 = $this->searchIfInR2(array('at'))) != false) { + $this->word = Utf8::substr($this->word, 0, $position2); + } + return true; + } + + return false; + } + + /** + * Step 2a: Verb suffixes beginning y + */ + private function step2a() + { + // if found, delete if preceded by u + // (Note that the preceding u need not be in RV.) + if ( ($position = $this->searchIfInRv(array( + 'yamos', 'yendo', 'yeron', 'yan', 'yen', 'yais', 'yas', 'yes', 'yo', 'yó', 'ya', 'ye'))) != false) { + + $before = Utf8::substr($this->word, ($position-1), 1); + if ( (isset($before)) && ($before == 'u') ) { + $this->word = Utf8::substr($this->word, 0, $position); + return true; + } + } + + return false; + } + + /** + * Step 2b: Other verb suffixes + * Search for the longest among the following suffixes in RV, and perform the action indicated. 
+ */ + private function step2b() + { + // delete + if ( ($position = $this->searchIfInRv(array( + 'iésemos', 'iéramos', 'ábamos', 'iríamos', 'eríamos', 'aríamos', 'áramos', 'ásemos', 'eríais', + 'aremos', 'eremos', 'iremos', 'asteis', 'ieseis', 'ierais', 'isteis', 'aríais', + 'irían', 'aréis', 'erían', 'erías', 'eréis', 'iréis', 'irías', 'ieran', 'iesen', 'ieron', 'iendo', 'ieras', + 'iríais', 'arían', 'arías', + 'amos', 'imos', 'ados', 'idos', 'irán', 'irás', 'erán', 'erás', 'ería', 'iría', 'íais', 'arán', 'arás', 'aría', + 'iera', 'iese', 'aste', 'iste', 'aban', 'aran', 'asen', 'aron', 'ando', 'abas', 'adas', 'idas', 'ases', 'aras', + 'aré', 'erá', 'eré', 'áis', 'ías', 'irá', 'iré', 'aba', 'ían', 'ada', 'ara', 'ase', 'ida', 'ado', 'ido', 'ará', + 'ad', 'ed', 'id', 'ís', 'ió', 'ar', 'er', 'ir', 'as', 'ía', 'an' + ))) != false) { + $this->word = Utf8::substr($this->word, 0, $position); + return true; + } + + // en es éis emos + // delete, and if preceded by gu delete the u (the gu need not be in RV) + if ( ($position = $this->searchIfInRv(array('éis', 'emos', 'en', 'es'))) != false) { + $this->word = Utf8::substr($this->word, 0, $position); + + if ( ($position2 = $this->search(array('gu'))) != false) { + $this->word = Utf8::substr($this->word, 0, ($position2+1)); + } + + + return true; + } + } + + /** + * Step 3: residual suffix + * Search for the longest among the following suffixes in RV, and perform the action indicated. 
+ */ + private function step3() + { + // os a o á í ó + // delete if in RV + if ( ($position = $this->searchIfInRv(array('os', 'a', 'o', 'á', 'í', 'ó'))) != false) { + $this->word = Utf8::substr($this->word, 0, $position); + return true; + } + + // e é + // delete if in RV, and if preceded by gu with the u in RV delete the u + if ( ($position = $this->searchIfInRv(array('e', 'é'))) != false) { + $this->word = Utf8::substr($this->word, 0, $position); + + if ( ($position2 = $this->searchIfInRv(array('u'))) != false) { + $before = Utf8::substr($this->word, ($position2-1), 1); + if ( (isset($before)) && ($before == 'g') ) { + $this->word = Utf8::substr($this->word, 0, $position2); + return true; + } + } + } + + return false; + } + + /** + * And finally: + * Remove acute accents + */ + private function finish() + { + $this->word = Utf8::str_replace(array('á', 'í', 'ó', 'é', 'ú'), array('a', 'i', 'o', 'e', 'u'), $this->word); + } +} diff --git a/libraries/vendor/wamania/php-stemmer/src/Stem.php b/libraries/vendor/wamania/php-stemmer/src/Stem.php new file mode 100644 index 00000000000..642ece8d0c0 --- /dev/null +++ b/libraries/vendor/wamania/php-stemmer/src/Stem.php @@ -0,0 +1,218 @@ += $this->rvIndex); + } + + protected function inR1($position) + { + return ($position >= $this->r1Index); + } + + protected function inR2($position) + { + return ($position >= $this->r2Index); + } + + protected function searchIfInRv($suffixes) + { + return $this->search($suffixes, $this->rvIndex); + } + + protected function searchIfInR1($suffixes) + { + return $this->search($suffixes, $this->r1Index); + } + + protected function searchIfInR2($suffixes) + { + return $this->search($suffixes, $this->r2Index); + } + + protected function search($suffixes, $offset = 0) + { + $length = Utf8::strlen($this->word); + if ($offset > $length) { + return false; + } + foreach ($suffixes as $suffixe) { + if ( (($position = Utf8::strrpos($this->word, $suffixe, $offset)) !== false) && 
((Utf8::strlen($suffixe)+$position) == $length) ) { + return $position; + } + } + + return false; + } + + /** + * R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel. + */ + protected function r1() + { + list($this->r1Index, $this->r1) = $this->rx($this->word); + } + + /** + * R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel. + */ + protected function r2() + { + list($index, $value) = $this->rx($this->r1); + + $this->r2 = $value; + $this->r2Index = $this->r1Index + $index; + } + + /** + * Common function for R1 and R2 + * Search the region after the first non-vowel following a vowel in $word, or the end of the word if there is no such non-vowel. + * R1 : $in = $this->word + * R2 : $in = R1 + */ + protected function rx($in) + { + $length = Utf8::strlen($in); + + // defaults + $value = ''; + $index = $length; + + // we search all vowels + $vowels = array(); + for ($i=0; $i<$length; $i++) { + $letter = Utf8::substr($in, $i, 1); + if (in_array($letter, static::$vowels)) { + $vowels[] = $i; + } + } + + // search the non-vowel following a vowel + foreach ($vowels as $position) { + $after = $position + 1; + $letter = Utf8::substr($in, $after, 1); + + if (! in_array($letter, static::$vowels)) { + $index = $after + 1; + $value = Utf8::substr($in, ($after+1)); + + break; + } + } + + return array($index, $value); + } + + /** + * Used by spanish, italian, portuguese, etc (but not by french) + * + * If the second letter is a consonant, RV is the region after the next following vowel, + * or if the first two letters are vowels, RV is the region after the next consonant, + * and otherwise (consonant-vowel case) RV is the region after the third letter. + * But RV is the end of the word if these positions cannot be found. 
+ */ + protected function rv() + { + $length = Utf8::strlen($this->word); + + $this->rv = ''; + $this->rvIndex = $length; + + if ($length < 3) { + return true; + } + + $first = Utf8::substr($this->word, 0, 1); + $second = Utf8::substr($this->word, 1, 1); + + // If the second letter is a consonant, RV is the region after the next following vowel, + if (!in_array($second, static::$vowels)) { + for ($i=2; $i<$length; $i++) { + $letter = Utf8::substr($this->word, $i, 1); + if (in_array($letter, static::$vowels)) { + $this->rvIndex = $i + 1; + $this->rv = Utf8::substr($this->word, ($i+1)); + return true; + } + } + } + + // or if the first two letters are vowels, RV is the region after the next consonant, + if ( (in_array($first, static::$vowels)) && (in_array($second, static::$vowels)) ) { + for ($i=2; $i<$length; $i++) { + $letter = Utf8::substr($this->word, $i, 1); + if (! in_array($letter, static::$vowels)) { + $this->rvIndex = $i + 1; + $this->rv = Utf8::substr($this->word, ($i+1)); + return true; + } + } + } + + // and otherwise (consonant-vowel case) RV is the region after the third letter. + if ( (! 
in_array($first, static::$vowels)) && (in_array($second, static::$vowels)) ) { + $this->rv = Utf8::substr($this->word, 3); + $this->rvIndex = 3; + return true; + } + } +} diff --git a/libraries/vendor/wamania/php-stemmer/src/Stemmer.php b/libraries/vendor/wamania/php-stemmer/src/Stemmer.php new file mode 100644 index 00000000000..c013fa858e9 --- /dev/null +++ b/libraries/vendor/wamania/php-stemmer/src/Stemmer.php @@ -0,0 +1,19 @@ + + */ +interface Stemmer +{ + /** + * Main function to get the STEM of a word + * + * @param string $word A valid UTF-8 word + * + * @return string + * + * @throws \Exception + */ + public function stem($word); +} diff --git a/libraries/vendor/wamania/php-stemmer/src/Swedish.php b/libraries/vendor/wamania/php-stemmer/src/Swedish.php new file mode 100644 index 00000000000..41496976b1b --- /dev/null +++ b/libraries/vendor/wamania/php-stemmer/src/Swedish.php @@ -0,0 +1,127 @@ +word = Utf8::strtolower($word); + + // R2 is not used: R1 is defined in the same way as in the German stemmer + $this->r1(); + + // then R1 is adjusted so that the region before it contains at least 3 letters. + if ($this->r1Index < 3) { + $this->r1Index = 3; + $this->r1 = Utf8::substr($this->word, 3); + } + + // Do each of steps 1, 2 3 and 4. + $this->step1(); + $this->step2(); + $this->step3(); + + return $this->word; + } + + /** + * Define a valid s-ending as one of + * b c d f g h j k l m n o p r t v y + * + * @param string $ending + * @return boolean + */ + private function hasValidSEnding($word) + { + $lastLetter = Utf8::substr($word, -1, 1); + return in_array($lastLetter, array('b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y')); + } + + /** + * Step 1 + * Search for the longest among the following suffixes in R1, and perform the action indicated. 
+ */ + private function step1() + { + // a arna erna heterna orna ad e ade ande arne are aste en anden aren heten + // ern ar er heter or as arnas ernas ornas es ades andes ens arens hetens + // erns at andet het ast + // delete + if ( ($position = $this->searchIfInR1(array( + 'heterna', 'hetens', 'ornas', 'andes', 'arnas', 'heter', 'ernas', 'anden', 'heten', 'andet', 'arens', + 'orna', 'arna', 'erna', 'aren', 'ande', 'ades', 'arne', 'erns', 'aste', 'ade', 'ern', 'het', + 'ast', 'are', 'ens', 'or', 'es', 'ad', 'en', 'at', 'ar', 'as', 'er', 'a', 'e' + ))) !== false) { + $this->word = Utf8::substr($this->word, 0, $position); + return true; + } + + // s + // delete if preceded by a valid s-ending + if ( ($position = $this->searchIfInR1(array('s'))) !== false) { + $word = Utf8::substr($this->word, 0, $position); + if ($this->hasValidSEnding($word)) { + $this->word = $word; + } + } + } + + /** + * Step 2 + * Search for one of the following suffixes in R1, and if found delete the last letter. + */ + private function step2() + { + // dd gd nn dt gt kt tt + if ($this->searchIfInR1(array('dd', 'gd', 'nn', 'dt', 'gt', 'kt', 'tt')) !== false) { + $this->word = Utf8::substr($this->word, 0, -1); + } + } + + /** + * Step 3: + * Search for the longest among the following suffixes in R1, and perform the action indicated. 
+ */ + private function step3() + { + // lig ig els + // delete + if ( ($position = $this->searchIfInR1(array('lig', 'ig', 'els'))) !== false) { + $this->word = Utf8::substr($this->word, 0, $position); + return true; + } + + // löst + // replace with lös + if ( ($this->searchIfInR1(array('löst'))) !== false) { + $this->word = Utf8::substr($this->word, 0, -1); + return true; + } + + // fullt + // replace with full + if ( ($this->searchIfInR1(array('fullt'))) !== false) { + $this->word = Utf8::substr($this->word, 0, -1); + return true; + } + } +} diff --git a/libraries/vendor/wamania/php-stemmer/src/Utf8.php b/libraries/vendor/wamania/php-stemmer/src/Utf8.php new file mode 100644 index 00000000000..e316c0d970d --- /dev/null +++ b/libraries/vendor/wamania/php-stemmer/src/Utf8.php @@ -0,0 +1,708 @@ + + * @package Stato + * @subpackage view + */ + +class Utf8 +{ + /** + * UTF-8 lookup table for lower case accented letters + * + * This lookuptable defines replacements for accented characters from the ASCII-7 + * range. This are lower case letters only. + * + * @author Andreas Gohr + * @see utf8_deaccent() + */ + private static $utf8_lower_accents = array( + 'à' => 'a', 'ô' => 'o', 'd' => 'd', '?' => 'f', 'ë' => 'e', 'š' => 's', 'o' => 'o', + 'ß' => 'ss', 'a' => 'a', 'r' => 'r', '?' => 't', 'n' => 'n', 'a' => 'a', 'k' => 'k', + 's' => 's', '?' => 'y', 'n' => 'n', 'l' => 'l', 'h' => 'h', '?' => 'p', 'ó' => 'o', + 'ú' => 'u', 'e' => 'e', 'é' => 'e', 'ç' => 'c', '?' => 'w', 'c' => 'c', 'õ' => 'o', + '?' => 's', 'ø' => 'o', 'g' => 'g', 't' => 't', '?' => 's', 'e' => 'e', 'c' => 'c', + 's' => 's', 'î' => 'i', 'u' => 'u', 'c' => 'c', 'e' => 'e', 'w' => 'w', '?' => 't', + 'u' => 'u', 'c' => 'c', 'ö' => 'oe', 'è' => 'e', 'y' => 'y', 'a' => 'a', 'l' => 'l', + 'u' => 'u', 'u' => 'u', 's' => 's', 'g' => 'g', 'l' => 'l', 'ƒ' => 'f', 'ž' => 'z', + '?' => 'w', '?' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', '?' 
=> 'd', 't' => 't', + 'r' => 'r', 'ä' => 'ae', 'í' => 'i', 'r' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o', + 'e' => 'e', 'ñ' => 'n', 'n' => 'n', 'h' => 'h', 'g' => 'g', 'd' => 'd', 'j' => 'j', + 'ÿ' => 'y', 'u' => 'u', 'u' => 'u', 'u' => 'u', 't' => 't', 'ý' => 'y', 'o' => 'o', + 'â' => 'a', 'l' => 'l', '?' => 'w', 'z' => 'z', 'i' => 'i', 'ã' => 'a', 'g' => 'g', + '?' => 'm', 'o' => 'o', 'i' => 'i', 'ù' => 'u', 'i' => 'i', 'z' => 'z', 'á' => 'a', + 'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', + ); + + /** + * UTF-8 Case lookup table + * + * This lookuptable defines the upper case letters to their correspponding + * lower case letter in UTF-8 + * + * @author Andreas Gohr + */ + private static $utf8_lower_to_upper = array( + 0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042, + 0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100, + 0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393, + 0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C, + 0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F, + 0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E, + 0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3, + 0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A, + 0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9, + 0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C, + 0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4, + 0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164, + 0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156, + 0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118, + 0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128, + 0x03C7=>0x03A7, 
0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428, + 0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055, + 0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A, + 0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC, + 0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0, + 0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D, + 0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0, + 0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5, + 0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA, + 0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045, + 0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F, + 0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048, + 0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6, + 0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407, + 0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395, + 0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396, + 0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051, + 0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408, + 0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F, + 0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126, + 0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C, + 0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E, + 0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB, + 0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421, + 0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 
0x0077=>0x0057, 0x011B=>0x011A, + 0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102, + 0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9, + 0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122, + ); + + /** + * UTF-8 Case lookup table + * + * This lookuptable defines the lower case letters to their correspponding + * upper case letter in UTF-8 (it does so by flipping $utf8_lower_to_upper) + * + * @author Andreas Gohr + */ + //private static $utf8_upper_to_lower = array_flip(self::$utf8_lower_to_upper); + + + /** + * UTF-8 lookup table for upper case accented letters + * + * This lookuptable defines replacements for accented characters from the ASCII-7 + * range. This are upper case letters only. + * + * @author Andreas Gohr + * @see utf8_deaccent() + */ + private static $utf8_upper_accents = array( + 'À' => 'A', 'Ô' => 'O', 'D' => 'D', '?' => 'F', 'Ë' => 'E', 'Š' => 'S', 'O' => 'O', + 'A' => 'A', 'R' => 'R', '?' => 'T', 'N' => 'N', 'A' => 'A', 'K' => 'K', + 'S' => 'S', '?' => 'Y', 'N' => 'N', 'L' => 'L', 'H' => 'H', '?' => 'P', 'Ó' => 'O', + 'Ú' => 'U', 'E' => 'E', 'É' => 'E', 'Ç' => 'C', '?' => 'W', 'C' => 'C', 'Õ' => 'O', + '?' => 'S', 'Ø' => 'O', 'G' => 'G', 'T' => 'T', '?' => 'S', 'E' => 'E', 'C' => 'C', + 'S' => 'S', 'Î' => 'I', 'U' => 'U', 'C' => 'C', 'E' => 'E', 'W' => 'W', '?' => 'T', + 'U' => 'U', 'C' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Y' => 'Y', 'A' => 'A', 'L' => 'L', + 'U' => 'U', 'U' => 'U', 'S' => 'S', 'G' => 'G', 'L' => 'L', 'ƒ' => 'F', 'Ž' => 'Z', + '?' => 'W', '?' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', '?' => 'D', 'T' => 'T', + 'R' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'R' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O', + 'E' => 'E', 'Ñ' => 'N', 'N' => 'N', 'H' => 'H', 'G' => 'G', 'Ð' => 'D', 'J' => 'J', + 'Ÿ' => 'Y', 'U' => 'U', 'U' => 'U', 'U' => 'U', 'T' => 'T', 'Ý' => 'Y', 'O' => 'O', + 'Â' => 'A', 'L' => 'L', '?' => 'W', 'Z' => 'Z', 'I' => 'I', 'Ã' => 'A', 'G' => 'G', + '?' 
=> 'M', 'O' => 'O', 'I' => 'I', 'Ù' => 'U', 'I' => 'I', 'Z' => 'Z', 'Á' => 'A', + 'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', + ); + + /** + * UTF-8 array of common special characters + * + * This array should contain all special characters (not a letter or digit) + * defined in the various local charsets - it's not a complete list of non-alphanum + * characters in UTF-8. It's not perfect but should match most cases of special + * chars. + * + * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is! + * These chars are _not_ in the array either: _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a + * + * @author Andreas Gohr + * @see utf8_stripspecials() + */ + private static $utf8_special_chars = array( + 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002b, 0x002c, + 0x002f, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b, + 0x005c, 0x005d, 0x005e, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e, + 0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, + 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092, + 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, + 0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, + 0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0, + 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, + 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9, + 0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384, + 0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1, + 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc, + 0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c, + 0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 
0x0651, + 0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015, + 0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022, + 0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab, + 0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193, + 0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202, + 0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212, + 0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229, + 0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265, + 0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310, + 0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514, + 0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553, + 0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, + 0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567, + 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590, + 0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7, + 0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702, + 0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f, + 0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719, + 0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723, + 0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e, + 0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738, + 0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742, + 0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d, + 0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c, + 0x275d, 0x275e, 
0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
        0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
        0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
        0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
        0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
        0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
        0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
        0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
        0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
        0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
    );

    /**
     * URL-encode a filename so that it may contain Unicode characters.
     *
     * Slashes are not encoded: urlencode() is applied and every '%2F' is
     * turned back into '/'.
     *
     * When $safe is true, a string made up solely of the characters
     * a-z, A-Z, 0-9, '/', '_', '-', '.' and '%' is returned untouched. This
     * makes it safe to run the method several times on the same string.
     *
     * @author Andreas Gohr
     * @see    urlencode
     *
     * @param  string  $file  The filename to encode.
     * @param  bool    $safe  Skip encoding when only the characters listed above occur (default true).
     *
     * @return string  The encoded filename.
     */
    public static function encode_fn($file, $safe = true)
    {
        if ($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#', $file)) {
            return $file;
        }

        $file = urlencode($file);
        $file = str_replace('%2F', '/', $file);

        return $file;
    }

    /**
     * URL-decode a filename.
     *
     * This is just a wrapper around urldecode().
     *
     * @author Andreas Gohr
     * @see    urldecode
     *
     * @param  string  $file  The encoded filename.
     *
     * @return string  The decoded filename.
     */
    public static function decode_fn($file)
    {
        $file = urldecode($file);

        return $file;
    }

    /**
     * Checks if a string contains 7bit ASCII only.
     *
     * NOTE(review): the loop header and condition were damaged in this copy
     * of the file (only "$i127) return false;" survived); they are restored
     * here from that remnant - confirm against upstream.
     *
     * @author Andreas Gohr
     *
     * @param  string  $str  The string to inspect.
     *
     * @return bool  False as soon as a byte above 127 is found, true otherwise.
     */
    public static function is_ascii($str)
    {
        for ($i = 0, $len = strlen($str); $i < $len; $i++) {
            if (ord($str[$i]) > 127) {
                return false;
            }
        }

        return true;
    }

    /**
     * Strips all highbyte chars.
     *
     * Returns a pure ASCII7 string.
     *
     * NOTE(review): the loop body was lost in this copy of the file; it is
     * restored here from the description above - confirm against upstream.
     *
     * @author Andreas Gohr
     *
     * @param  string  $str  The string to filter.
     *
     * @return string  The input with every byte >= 128 removed.
     */
    public static function strip($str)
    {
        $ascii = '';

        for ($i = 0, $len = strlen($str); $i < $len; $i++) {
            if (ord($str[$i]) < 128) {
                $ascii .= $str[$i];
            }
        }

        return $ascii;
    }

    /**
     * Presumably checks whether a string is well-formed UTF-8.
     * NOTE(review): the original summary of this docblock was lost in this
     * copy of the file - confirm the wording against upstream.
     *
     * @link
http://www.php.net/manual/en/function.utf8-encode.php
     */
    public static function check($str)
    {
        // NOTE(review): everything after "for ($i=0; $i" was lost in this copy
        // of the file. The byte walk below is a reconstruction (lead byte
        // announces N continuation bytes, each must match 10bbbbbb) - confirm
        // against upstream before relying on edge-case behaviour.
        $len = strlen($str);

        for ($i = 0; $i < $len; $i++) {
            $b = ord($str[$i]);

            if ($b < 0x80) {
                continue;                       // 0bbbbbbb
            } elseif (($b & 0xE0) == 0xC0) {
                $n = 1;                         // 110bbbbb
            } elseif (($b & 0xF0) == 0xE0) {
                $n = 2;                         // 1110bbbb
            } elseif (($b & 0xF8) == 0xF0) {
                $n = 3;                         // 11110bbb
            } elseif (($b & 0xFC) == 0xF8) {
                $n = 4;                         // 111110bb
            } elseif (($b & 0xFE) == 0xFC) {
                $n = 5;                         // 1111110b
            } else {
                return false;                   // does not match any lead-byte model
            }

            // $n continuation bytes must follow.
            for ($j = 0; $j < $n; $j++) {
                if ((++$i == $len) || ((ord($str[$i]) & 0xC0) != 0x80)) {
                    return false;
                }
            }
        }

        return true;
    }

    /**
     * Unicode aware replacement for strlen().
     *
     * Counts characters by measuring the byte length of utf8_decode($string).
     *
     * NOTE(review): the upper part of this docblock was lost in this copy of
     * the file and has been rewritten from the code. utf8_decode() is
     * deprecated as of PHP 8.2 - plan a replacement.
     *
     * @see    strlen()
     * @see    utf8_decode()
     *
     * @param  string  $string  UTF-8 encoded string.
     *
     * @return int  Number of characters.
     */
    public static function strlen($string)
    {
        return strlen(utf8_decode($string));
    }

    /**
     * Unicode aware replacement for substr().
     *
     * The string is split into characters with a /u regular expression and
     * the requested slice is taken with array_slice(), so $start and $length
     * count characters, not bytes.
     *
     * @author lmak at NOSPAM dot iti dot gr
     * @link   http://www.php.net/manual/en/function.substr.php
     * @see    substr()
     *
     * @param  string    $str     UTF-8 encoded string.
     * @param  int       $start   Character offset of the first character to return.
     * @param  int|null  $length  Number of characters to return; null means "to the end".
     *
     * @return string  The requested part of the string.
     */
    public static function substr($str, $start, $length = null)
    {
        $ar = array();

        // The "s" modifier lets "." match newlines too; without it line breaks
        // were dropped from the character list and every later offset shifted.
        preg_match_all('/./us', $str, $ar);

        $chars = isset($ar[0]) ? $ar[0] : array();

        // array_slice() treats a null length as "up to the end", so an explicit
        // length of 0 now yields an empty string, as substr() does.
        return join('', array_slice($chars, $start, $length));
    }

    /**
     * Unicode aware replacement for substr_replace().
     *
     * Only the simple case is implemented: the characters before $start are
     * kept, $replacement is appended and, when $length is given, the
     * characters from $start + $length onwards follow.
     *
     * @author Andreas Gohr
     * @see    substr_replace()
     *
     * @param  string    $string       UTF-8 encoded string.
     * @param  string    $replacement  The text to put in.
     * @param  int       $start        Character offset where the replacement begins.
     * @param  int|null  $length       Number of characters to replace; null replaces up to the end.
     *
     * @return string  The resulting string.
     */
    public static function substr_replace($string, $replacement, $start, $length = null)
    {
        $ret = '';

        if ($start > 0) {
            $ret .= self::substr($string, 0, $start);
        }

        $ret .= $replacement;

        // Strict check: a length of 0 means "insert", so the remainder must be kept.
        if ($length !== null) {
            $ret .= self::substr($string, $start + $length);
        }

        return $ret;
    }

    /**
     * Unicode aware replacement for explode().
     *
     * @TODO   support third limit arg
     * @author Harry Fuecks
     * @see    explode();
     *
     * @param  string  $sep  The delimiter; must not be empty.
     * @param  string  $str  The string to split.
     *
     * @return array|false  The pieces, or false (after an E_USER_WARNING) for an empty delimiter.
     */
    public static function explode($sep, $str)
    {
        if ($sep == '') {
            trigger_error('Empty delimiter', E_USER_WARNING);

            return false;
        }

        return preg_split('!' . preg_quote($sep, '!') . '!u', $str);
    }

    /**
     * Unicode aware replacement for str_replace().
     *
     * The search term(s) are quoted and used as /u regular expressions with
     * preg_replace(); $r is therefore subject to preg_replace()'s replacement
     * syntax.
     *
     * @todo   support PHP5 count (fourth arg)
     * @author Harry Fuecks
     * @see    str_replace();
     *
     * @param  string|array  $s    The search term or a list of search terms.
     * @param  string|array  $r    The replacement(s), passed on to preg_replace().
     * @param  string        $str  The string to work on.
     *
     * @return string  The string with replacements applied.
     */
    public static function str_replace($s, $r, $str)
    {
        if (!is_array($s)) {
            $s = '!' . preg_quote($s, '!') . '!u';
        } else {
            foreach ($s as $k => $v) {
                // Quote the "!" delimiter as well, exactly like the scalar branch;
                // otherwise a search term containing "!" breaks the pattern.
                $s[$k] = '!' . preg_quote($v, '!') . '!u';
            }
        }

        return preg_replace($s, $r, $str);
    }

    /**
     * Unicode aware replacement for ltrim().
     *
     * Without a character list the native ltrim() is used; otherwise the
     * list is quoted for use in a character class and matching characters
     * are removed from the start of the string with a /u regular expression.
     *
     * @author Andreas Gohr
     * @see    ltrim()
     *
     * @param  string  $str       The string to trim.
     * @param  string  $charlist  Characters to strip.
     *
     * @return string
     */
    public static function ltrim($str,$charlist='')
    {
        if($charlist ==
'') return ltrim($str);

        //quote charlist for use in a characterclass
        $charlist = preg_replace('!([\\\\\\-\\]\\[/])!','\\\${1}',$charlist);

        return preg_replace('/^['.$charlist.']+/u','',$str);
    }

    /**
     * Unicode aware replacement for rtrim().
     *
     * Without a character list the native rtrim() is used; otherwise the
     * list is quoted for use in a character class and matching characters
     * are removed from the end of the string with a /u regular expression.
     *
     * @author Andreas Gohr
     * @see    rtrim()
     *
     * @param  string  $str       The string to trim.
     * @param  string  $charlist  Characters to strip.
     *
     * @return string
     */
    public static function rtrim($str, $charlist = '')
    {
        if ($charlist == '') {
            return rtrim($str);
        }

        //quote charlist for use in a characterclass
        $charlist = preg_replace('!([\\\\\\-\\]\\[/])!','\\\${1}',$charlist);

        return preg_replace('/['.$charlist.']+$/u', '', $str);
    }

    /**
     * Unicode aware replacement for trim().
     *
     * Without a character list the native trim() is used; otherwise the
     * string is passed through rtrim() and ltrim() of this class.
     *
     * @author Andreas Gohr
     * @see    trim()
     *
     * @param  string  $str       The string to trim.
     * @param  string  $charlist  Characters to strip from both ends.
     *
     * @return string
     */
    public static function trim($str, $charlist = '')
    {
        if ($charlist == '') {
            return trim($str);
        }

        // The character list has to be handed on; without it both helpers fall
        // back to native whitespace trimming and the caller's list is ignored.
        return self::ltrim(self::rtrim($str, $charlist), $charlist);
    }


    /**
     * This is a unicode aware replacement for strtolower().
     *
     * Uses mb_string extension if available (and UTF8_NOMBSTRING is not
     * defined). Otherwise the string is converted to code points, each code
     * point found in the flipped self::$utf8_lower_to_upper table is replaced,
     * and the result is converted back to UTF-8.
     *
     * @author Andreas Gohr
     * @see    strtolower()
     * @see    utf8_strtoupper()
     *
     * @param  string  $string  UTF-8 encoded string.
     *
     * @return string  The lower-cased string.
     */
    public static function strtolower($string)
    {
        if (!defined('UTF8_NOMBSTRING') && function_exists('mb_strtolower')) {
            return mb_strtolower($string, 'utf-8');
        }

        // self::$utf8_lower_to_upper is declared outside this excerpt;
        // presumably it maps lower-case code points to upper-case ones.
        $utf8_upper_to_lower = array_flip(self::$utf8_lower_to_upper);
        $uni = self::utf8_to_unicode($string);
        $cnt = count($uni);

        for ($i = 0; $i < $cnt; $i++) {
            // isset() avoids an "undefined offset" notice for every code point
            // that has no lower-case counterpart in the table.
            if (isset($utf8_upper_to_lower[$uni[$i]])) {
                $uni[$i] = $utf8_upper_to_lower[$uni[$i]];
            }
        }

        return self::unicode_to_utf8($uni);
    }

    /**
     * This is a unicode aware replacement for strtoupper().
     *
     * Uses mb_string extension if available (and UTF8_NOMBSTRING is not
     * defined). Otherwise the string is converted to code points, each code
     * point found in self::$utf8_lower_to_upper is replaced, and the result
     * is converted back to UTF-8.
     *
     * @author Andreas Gohr
     * @see    strtoupper()
     * @see    utf8_strtolower()
     *
     * @param  string  $string  UTF-8 encoded string.
     *
     * @return string  The upper-cased string.
     */
    public static function strtoupper($string)
    {
        // Test for the function that is actually called below.
        if (!defined('UTF8_NOMBSTRING') && function_exists('mb_strtoupper'))
            return mb_strtoupper($string,'utf-8');

        $uni = self::utf8_to_unicode($string);
        $cnt = count($uni);
for ($i=0; $i < $cnt; $i++){
            // isset() avoids an "undefined offset" notice for every code point
            // that has no entry in the lower-to-upper table.
            if (isset(self::$utf8_lower_to_upper[$uni[$i]])) {
                $uni[$i] = self::$utf8_lower_to_upper[$uni[$i]];
            }
        }

        return self::unicode_to_utf8($uni);
    }

    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents.
     *
     * Use the optional parameter to just deaccent lower ($case = -1) or upper
     * ($case = 1) letters. Default is to deaccent both cases ($case = 0).
     * The replacements come from the self::$utf8_lower_accents and
     * self::$utf8_upper_accents tables (keys are replaced by their values).
     *
     * @author Andreas Gohr
     *
     * @param  string  $string  UTF-8 encoded string.
     * @param  int     $case    -1 lower case only, 1 upper case only, 0 both.
     *
     * @return string  The string with the accented characters replaced.
     */
    public static function deaccent($string, $case = 0)
    {
        if ($case <= 0) {
            $string = str_replace(
                array_keys(self::$utf8_lower_accents),
                array_values(self::$utf8_lower_accents),
                $string
            );
        }

        if ($case >= 0) {
            $string = str_replace(
                array_keys(self::$utf8_upper_accents),
                array_values(self::$utf8_upper_accents),
                $string
            );
        }

        return $string;
    }

    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string.
     *
     * This function adds the controlchars 0x00 to 0x19 to the array of
     * stripped chars (they are not included in $utf8_special_chars).
     *
     * The quoted character list is built once per request from
     * self::$utf8_special_chars and kept in a static variable.
     *
     * @author Andreas Gohr
     *
     * @param  string  $string      The UTF8 string to strip of special chars
     * @param  string  $repl        Replace special with this string
     * @param  string  $additional  Additional chars to strip (used in regexp char class,
     *                              inserted as given - the caller has to quote it)
     *
     * @return string  The string with every matching character replaced by $repl.
     */
    public static function stripspecials($string, $repl = '', $additional = '')
    {
        static $specials = null;

        if (is_null($specials)) {
            $specials = preg_quote(self::unicode_to_utf8(self::$utf8_special_chars), '/');
        }

        return preg_replace('/['.$additional.'\x00-\x19'.$specials.']/u', $repl, $string);
    }

    /**
     * This is an Unicode aware replacement for strpos.
     *
     * Uses mb_string extension if available (and UTF8_NOMBSTRING is not
     * defined); otherwise a fallback built on explode(), strlen() and
     * substr() of this class is used.
     *
     * @author Harry Fuecks
     * @see    strpos()
     *
     * @param  string  $haystack  UTF-8 encoded string to search in.
     * @param  string  $needle    UTF-8 encoded string to search for.
     * @param  int     $offset    Character offset to start searching at.
     *
     * @return int|false  Character position of the first match or false.
     */
    public static function strpos($haystack, $needle, $offset=0)
    {
        if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strpos'))
            return mb_strpos($haystack,$needle,$offset,'utf-8');

if(!$offset){
            // Fallback without mbstring: the character length of everything
            // in front of the first occurrence is the position.
            $ar = self::explode($needle, $haystack);

            // explode() returns false for an empty needle; count(false) raises
            // a warning (PHP 7.2+) or a TypeError (PHP 8).
            if (is_array($ar) && count($ar) > 1) {
                return self::strlen($ar[0]);
            }

            return false;
        } else {
            if (!is_int($offset)) {
                trigger_error('Offset must be an integer',E_USER_WARNING);

                return false;
            }

            // Search the remainder and shift the result back by the offset.
            $str = self::substr($haystack, $offset);

            if (false !== ($pos = self::strpos($str, $needle))) {
                return $pos + $offset;
            }

            return false;
        }
    }

    /**
     * This is an Unicode aware replacement for strrpos.
     *
     * Uses mb_string extension if available (and UTF8_NOMBSTRING is not
     * defined). The fallback splits the haystack at the needle; the position
     * is the haystack length minus the length of the last piece minus the
     * length of the needle.
     *
     * @author Harry Fuecks
     * @see    strrpos()
     *
     * @param  string  $haystack  UTF-8 encoded string to search in.
     * @param  string  $needle    UTF-8 encoded string to search for.
     * @param  int     $offset    Character offset to start searching at.
     *
     * @return int|false  Character position of the last match or false.
     */
    public static function strrpos($haystack, $needle, $offset = 0)
    {
        if (!defined('UTF8_NOMBSTRING') && function_exists('mb_strrpos')) {
            return mb_strrpos($haystack, $needle, $offset, 'utf-8');
        }

        if (!$offset) {
            $ar = self::explode($needle, $haystack);

            // explode() returns false for an empty needle - see strpos().
            $count = is_array($ar) ? count($ar) : 0;

            if ($count > 1) {
                return self::strlen($haystack) - self::strlen($ar[($count-1)]) - self::strlen($needle);
            }

            return false;
        }

        if (!is_int($offset)) {
            trigger_error('Offset must be an integer', E_USER_WARNING);

            return false;
        }

        $str = self::substr($haystack, $offset);

        if (false !== ($pos = self::strrpos($str, $needle))) {
            return $pos + $offset;
        }

        return false;
    }

    /**
     * Encodes UTF-8 characters to HTML entities.
     *
     * Only two-byte sequences (lead byte 110x xxxx) are turned into numeric
     * entities; every other byte is copied through unchanged.
     *
     * @author
     * @link   http://www.php.net/manual/en/function.utf8-decode.php
     *
     * @param  string  $str  UTF-8 encoded string.
     *
     * @return string  The string with two-byte characters replaced by &#NNN; entities.
     */
    public static function tohtml ($str)
    {
        $ret = '';
        $max = strlen($str);
        $last = 0;  // keeps the index of the last regular character
        for ($i=0; $i<$max; $i++) {
            // Square brackets: the {} string offset syntax was removed in PHP 8.
            $c = $str[$i];
            $c1 = ord($c);
            if ($c1>>5 == 6) {  // 110x xxxx, 110 prefix for 2 bytes unicode
                $ret .= substr($str, $last, $i-$last); // append all the regular characters we've passed
                $c1 &= 31; // remove the 3 bit two bytes prefix
                $c2 = ord($str[++$i]); // the next byte
                $c2 &= 63;  // remove the 2 bit trailing byte prefix
                $c2 |= (($c1 & 3) << 6); // last 2 bits of c1 become first 2 of c2
                $c1 >>= 2; // c1
// shifts 2 to the right
                // $c1 now holds the high byte and $c2 the low byte of the code
                // point, so the weight of $c1 is 256 (the former factor 100 gave
                // wrong entities for every code point >= U+0100).
                $ret .= '&#' . ($c1 * 256 + $c2) . ';';
                $last = $i+1;
            }
        }

        return $ret . substr($str, $last, $i); // append the last batch of regular characters
    }

    /**
     * This function returns any UTF-8 encoded text as a list of
     * Unicode values.
     *
     * Bytes below 128 are emitted as they are. Any other byte is collected
     * until the number of bytes announced by the first collected byte is
     * reached (2 below 224, 3 below 240, 4 otherwise); the payload bits are
     * then combined into one code point. The input is not validated.
     *
     * @author Scott Michael Reynen
     * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
     * @see    unicode_to_utf8()
     *
     * @param  string  $str  UTF-8 encoded string (taken by reference, not modified).
     *
     * @return array  List of integer code points.
     */
    public static function utf8_to_unicode( &$str )
    {
        $unicode = array();
        $values = array();
        $looking_for = 1;
        $len = strlen( $str );  // hoisted: no need to re-measure on every iteration

        for ( $i = 0; $i < $len; $i++ ) {
            $this_value = ord( $str[ $i ] );

            if ( $this_value < 128 ) {
                $unicode[] = $this_value;
                continue;
            }

            if ( count( $values ) == 0 ) {
                // The first byte of a sequence tells how many bytes belong to it.
                if ( $this_value < 224 ) {
                    $looking_for = 2;
                } elseif ( $this_value < 240 ) {
                    $looking_for = 3;
                } else {
                    // Four-byte sequences (code points above U+FFFF) used to be
                    // read as three bytes, which also corrupted what followed.
                    $looking_for = 4;
                }
            }

            $values[] = $this_value;

            if ( count( $values ) == $looking_for ) {
                if ( $looking_for == 4 ) {
                    $number = ( ( $values[0] & 0x07 ) << 18 )
                        | ( ( $values[1] & 0x3F ) << 12 )
                        | ( ( $values[2] & 0x3F ) << 6 )
                        | ( $values[3] & 0x3F );
                } elseif ( $looking_for == 3 ) {
                    $number = ( ( $values[0] & 0x0F ) << 12 )
                        | ( ( $values[1] & 0x3F ) << 6 )
                        | ( $values[2] & 0x3F );
                } else {
                    $number = ( ( $values[0] & 0x1F ) << 6 )
                        | ( $values[1] & 0x3F );
                }

                $unicode[] = $number;
                $values = array();
                $looking_for = 1;
            }
        }

        return $unicode;
    }

    /**
     * This function converts a Unicode array back to its UTF-8 representation.
     *
     * @author Scott Michael Reynen
     * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
     * @see    utf8_to_unicode()
     *
     * @param  array  $str  List of integer code points (taken by reference, not modified).
     *
     * @return string  The UTF-8 encoded string; an empty string when $str is not an array.
     */
    public static function unicode_to_utf8( &$str )
    {
        if (!is_array($str)) {
            return '';
        }

        $utf8 = '';

        foreach ( $str as $unicode ) {
            if ( $unicode < 128 ) {
                $utf8 .= chr( $unicode );
            } elseif ( $unicode < 2048 ) {
                $utf8 .= chr( 192 + ( $unicode >> 6 ) );
                $utf8 .= chr( 128 + ( $unicode & 0x3F ) );
            } elseif ( $unicode < 65536 ) {
                $utf8 .= chr( 224 + ( $unicode >> 12 ) );
                $utf8 .= chr( 128 + ( ( $unicode >> 6 ) & 0x3F ) );
                $utf8 .= chr( 128 + ( $unicode & 0x3F ) );
            } else {
                // Code points above U+FFFF need four bytes; the three-byte
                // formula produced an invalid sequence for them.
                $utf8 .= chr( 240 + ( ( $unicode >> 18 ) & 0x07 ) );
                $utf8 .= chr( 128 + ( ( $unicode >> 12 ) & 0x3F ) );
                $utf8 .= chr( 128 + ( ( $unicode >> 6 ) & 0x3F ) );
                $utf8 .= chr( 128 + ( $unicode & 0x3F ) );
            }
        }

        return $utf8;
    }

    /**
     * UTF-8 to UTF-16BE conversion.
*
     * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits
     *
     * @param  string  $str  UTF-8 encoded string (taken by reference, not modified).
     * @param  bool    $bom  Prepend the big-endian byte order mark "\xFE\xFF".
     *
     * @return string  The UTF-16BE encoded string.
     */
    public static function utf8_to_utf16be(&$str, $bom = false)
    {
        $out = $bom ? "\xFE\xFF" : '';

        if (!defined('UTF8_NOMBSTRING') && function_exists('mb_convert_encoding')) {
            return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
        }

        $uni = self::utf8_to_unicode($str);

        foreach ($uni as $cp) {
            if ($cp > 0xFFFF) {
                // pack('n') keeps only the low 16 bits, so a code point beyond
                // the BMP has to be written as a surrogate pair.
                $cp -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            } else {
                $out .= pack('n', $cp);
            }
        }

        return $out;
    }

    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * Uses mb_string extension if available (and UTF8_NOMBSTRING is not
     * defined), mirroring utf8_to_utf16be(). The fallback reads the input as
     * a list of 16-bit big-endian units and encodes each unit on its own, so
     * without mb_string it is really UCS-2 (surrogate pairs are not combined).
     *
     * @param  string  $str  UTF-16BE encoded string (taken by reference, not modified).
     *
     * @return string  The UTF-8 encoded string.
     */
    public static function utf16be_to_utf8(&$str)
    {
        if (!defined('UTF8_NOMBSTRING') && function_exists('mb_convert_encoding')) {
            return mb_convert_encoding($str, 'UTF-8', 'UTF-16BE');
        }

        $uni = unpack('n*', $str);

        return self::unicode_to_utf8($uni);
    }
}