mirror of
https://github.com/joomla/joomla-cms.git
synced 2024-06-24 22:39:31 +00:00
[4.0] Using language specific tokeniser and stemmer for com_finder (#20391)
This commit is contained in:
parent
79f0d5dd67
commit
ff3ed42c6e
|
@ -7,7 +7,7 @@ pipeline:
|
|||
image: joomlaprojects/docker-phpcs
|
||||
commands:
|
||||
- echo $(date)
|
||||
- /root/.composer/vendor/bin/phpcs --report=full --extensions=php -p --standard=build/phpcs/Joomla .
|
||||
- /root/.composer/vendor/bin/phpcs --report=full --extensions=php -p --encoding=utf-8 --standard=build/phpcs/Joomla .
|
||||
- echo $(date)
|
||||
|
||||
initdb:
|
||||
|
|
5
.gitignore
vendored
5
.gitignore
vendored
|
@ -221,6 +221,11 @@ Desktop.ini
|
|||
/libraries/vendor/simplepie/simplepie/build
|
||||
/libraries/vendor/simplepie/simplepie/idn/ReadMe.txt
|
||||
/libraries/vendor/simplepie/simplepie/composer.json
|
||||
/libraries/vendor/wamania/php-stemmer/.gitignore
|
||||
/libraries/vendor/wamania/php-stemmer/README.md
|
||||
/libraries/vendor/wamania/php-stemmer/composer.json
|
||||
/libraries/vendor/wamania/php-stemmer/phpunit.xml.dist
|
||||
/libraries/vendor/wamania/php-stemmer/test
|
||||
/libraries/vendor/zendframework/zend-diactoros/.coveralls.yml
|
||||
/libraries/vendor/zendframework/zend-diactoros/CHANGELOG.md
|
||||
/libraries/vendor/zendframework/zend-diactoros/composer.json
|
||||
|
|
|
@ -269,29 +269,6 @@
|
|||
default="0.3"
|
||||
/>
|
||||
|
||||
<field
|
||||
name="stem"
|
||||
type="radio"
|
||||
label="COM_FINDER_CONFIG_STEMMER_ENABLE_LABEL"
|
||||
class="switcher"
|
||||
default="1"
|
||||
>
|
||||
<option value="0">JNO</option>
|
||||
<option value="1">JYES</option>
|
||||
</field>
|
||||
|
||||
<field
|
||||
name="stemmer"
|
||||
type="list"
|
||||
label="COM_FINDER_CONFIG_STEMMER_LABEL"
|
||||
default="snowball"
|
||||
showon="stem:1"
|
||||
>
|
||||
<option value="porter_en">COM_FINDER_CONFIG_STEMMER_PORTER_EN</option>
|
||||
<option value="fr">COM_FINDER_CONFIG_STEMMER_FR</option>
|
||||
<option value="snowball">COM_FINDER_CONFIG_STEMMER_SNOWBALL</option>
|
||||
</field>
|
||||
|
||||
<field
|
||||
name="enable_logging"
|
||||
type="radio"
|
||||
|
|
|
@ -14,8 +14,8 @@ use Joomla\CMS\Router\Router;
|
|||
use Joomla\Registry\Registry;
|
||||
use Joomla\String\StringHelper;
|
||||
|
||||
JLoader::register('FinderIndexerLanguage', __DIR__ . '/language.php');
|
||||
JLoader::register('FinderIndexerParser', __DIR__ . '/parser.php');
|
||||
JLoader::register('FinderIndexerStemmer', __DIR__ . '/stemmer.php');
|
||||
JLoader::register('FinderIndexerToken', __DIR__ . '/token.php');
|
||||
|
||||
/**
|
||||
|
@ -25,23 +25,6 @@ JLoader::register('FinderIndexerToken', __DIR__ . '/token.php');
|
|||
*/
|
||||
class FinderIndexerHelper
|
||||
{
|
||||
/**
|
||||
* The token stemmer object. The stemmer is set by whatever class
|
||||
* wishes to use it but it must be an instance of FinderIndexerStemmer.
|
||||
*
|
||||
* @var FinderIndexerStemmer
|
||||
* @since 2.5
|
||||
*/
|
||||
public static $stemmer;
|
||||
|
||||
/**
|
||||
* A state flag, in order to not constantly check if the stemmer is an instance of FinderIndexerStemmer
|
||||
*
|
||||
* @var boolean
|
||||
* @since 3.7.0
|
||||
*/
|
||||
protected static $stemmerOK;
|
||||
|
||||
/**
|
||||
* Method to parse input into plain text.
|
||||
*
|
||||
|
@ -73,82 +56,18 @@ class FinderIndexerHelper
|
|||
public static function tokenize($input, $lang, $phrase = false)
|
||||
{
|
||||
static $cache;
|
||||
$store = StringHelper::strlen($input) < 128 ? md5($input . '::' . $lang . '::' . $phrase) : null;
|
||||
$store = md5($input . '::' . $lang . '::' . $phrase);
|
||||
|
||||
// Check if the string has been tokenized already.
|
||||
if ($store && isset($cache[$store]))
|
||||
if (isset($cache[$store]))
|
||||
{
|
||||
return $cache[$store];
|
||||
}
|
||||
|
||||
$language = FinderIndexerLanguage::getInstance($lang);
|
||||
$tokens = array();
|
||||
$quotes = html_entity_decode('‘’'', ENT_QUOTES, 'UTF-8');
|
||||
|
||||
// Get the simple language key.
|
||||
$lang = static::getPrimaryLanguage($lang);
|
||||
|
||||
/*
|
||||
* Parsing the string input into terms is a multi-step process.
|
||||
*
|
||||
* Regexes:
|
||||
* 1. Remove everything except letters, numbers, quotes, apostrophe, plus, dash, period, and comma.
|
||||
* 2. Remove plus, dash, period, and comma characters located before letter characters.
|
||||
* 3. Remove plus, dash, period, and comma characters located after other characters.
|
||||
* 4. Remove plus, period, and comma characters enclosed in alphabetical characters. Ungreedy.
|
||||
* 5. Remove orphaned apostrophe, plus, dash, period, and comma characters.
|
||||
* 6. Remove orphaned quote characters.
|
||||
* 7. Replace the assorted single quotation marks with the ASCII standard single quotation.
|
||||
* 8. Remove multiple space characters and replaces with a single space.
|
||||
*/
|
||||
$input = StringHelper::strtolower($input);
|
||||
$input = preg_replace('#[^\pL\pM\pN\p{Pi}\p{Pf}\'+-.,]+#mui', ' ', $input);
|
||||
$input = preg_replace('#(^|\s)[+-.,]+([\pL\pM]+)#mui', ' $1', $input);
|
||||
$input = preg_replace('#([\pL\pM\pN]+)[+-.,]+(\s|$)#mui', '$1 ', $input);
|
||||
$input = preg_replace('#([\pL\pM]+)[+.,]+([\pL\pM]+)#muiU', '$1 $2', $input);
|
||||
$input = preg_replace('#(^|\s)[\'+-.,]+(\s|$)#mui', ' ', $input);
|
||||
$input = preg_replace('#(^|\s)[\p{Pi}\p{Pf}]+(\s|$)#mui', ' ', $input);
|
||||
$input = preg_replace('#[' . $quotes . ']+#mui', '\'', $input);
|
||||
$input = preg_replace('#\s+#mui', ' ', $input);
|
||||
$input = trim($input);
|
||||
|
||||
// Explode the normalized string to get the terms.
|
||||
$terms = explode(' ', $input);
|
||||
|
||||
/*
|
||||
* If we have Unicode support and are dealing with Chinese text, Chinese
|
||||
* has to be handled specially because there are not necessarily any spaces
|
||||
* between the "words". So, we have to test if the words belong to the Chinese
|
||||
* character set and if so, explode them into single glyphs or "words".
|
||||
*/
|
||||
if ($lang === 'zh')
|
||||
{
|
||||
// Iterate through the terms and test if they contain Chinese.
|
||||
for ($i = 0, $n = count($terms); $i < $n; $i++)
|
||||
{
|
||||
$charMatches = array();
|
||||
$charCount = preg_match_all('#[\p{Han}]#mui', $terms[$i], $charMatches);
|
||||
|
||||
// Split apart any groups of Chinese characters.
|
||||
for ($j = 0; $j < $charCount; $j++)
|
||||
{
|
||||
$tSplit = StringHelper::str_ireplace($charMatches[0][$j], '', $terms[$i], false);
|
||||
|
||||
if (!empty($tSplit))
|
||||
{
|
||||
$terms[$i] = $tSplit;
|
||||
}
|
||||
else
|
||||
{
|
||||
unset($terms[$i]);
|
||||
}
|
||||
|
||||
$terms[] = $charMatches[0][$j];
|
||||
}
|
||||
}
|
||||
|
||||
// Reset array keys.
|
||||
$terms = array_values($terms);
|
||||
}
|
||||
$terms = $language->tokenise($input);
|
||||
$terms = array_filter($terms);
|
||||
|
||||
/*
|
||||
* If we have to handle the input as a phrase, that means we don't
|
||||
|
@ -158,14 +77,14 @@ class FinderIndexerHelper
|
|||
if ($phrase === true && count($terms) > 1)
|
||||
{
|
||||
// Create tokens from the phrase.
|
||||
$tokens[] = new FinderIndexerToken($terms, $lang);
|
||||
$tokens[] = new FinderIndexerToken($terms, $language->language, $language->spacer);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Create tokens from the terms.
|
||||
for ($i = 0, $n = count($terms); $i < $n; $i++)
|
||||
{
|
||||
$tokens[] = new FinderIndexerToken($terms[$i], $lang);
|
||||
$tokens[] = new FinderIndexerToken($terms[$i], $language->language);
|
||||
}
|
||||
|
||||
// Create two and three word phrase tokens from the individual words.
|
||||
|
@ -179,7 +98,7 @@ class FinderIndexerHelper
|
|||
if ($i2 < $n && isset($tokens[$i2]))
|
||||
{
|
||||
// Tokenize the two word phrase.
|
||||
$token = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term), $lang, $lang === 'zh' ? '' : ' ');
|
||||
$token = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term), $language->language, $language->spacer);
|
||||
$token->derived = true;
|
||||
|
||||
// Add the token to the stack.
|
||||
|
@ -190,7 +109,7 @@ class FinderIndexerHelper
|
|||
if ($i3 < $n && isset($tokens[$i3]))
|
||||
{
|
||||
// Tokenize the three word phrase.
|
||||
$token = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term, $tokens[$i3]->term), $lang, $lang === 'zh' ? '' : ' ');
|
||||
$token = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term, $tokens[$i3]->term), $language->language, $language->spacer);
|
||||
$token->derived = true;
|
||||
|
||||
// Add the token to the stack.
|
||||
|
@ -199,22 +118,13 @@ class FinderIndexerHelper
|
|||
}
|
||||
}
|
||||
|
||||
if ($store)
|
||||
{
|
||||
$cache[$store] = count($tokens) > 1 ? $tokens : array_shift($tokens);
|
||||
$cache[$store] = $tokens;
|
||||
|
||||
return $cache[$store];
|
||||
}
|
||||
else
|
||||
{
|
||||
return count($tokens) > 1 ? $tokens : array_shift($tokens);
|
||||
}
|
||||
return $cache[$store];
|
||||
}
|
||||
|
||||
/**
|
||||
* Method to get the base word of a token. This method uses the public
|
||||
* {@link FinderIndexerHelper::$stemmer} object if it is set. If no stemmer is set,
|
||||
* the original token is returned.
|
||||
* Method to get the base word of a token.
|
||||
*
|
||||
* @param string $token The token to stem.
|
||||
* @param string $lang The language of the token.
|
||||
|
@ -225,31 +135,9 @@ class FinderIndexerHelper
|
|||
*/
|
||||
public static function stem($token, $lang)
|
||||
{
|
||||
// Trim apostrophes at either end of the token.
|
||||
$token = trim($token, '\'');
|
||||
$language = FinderIndexerLanguage::getInstance($lang);
|
||||
|
||||
// Trim everything after any apostrophe in the token.
|
||||
if ($res = explode('\'', $token))
|
||||
{
|
||||
$token = $res[0];
|
||||
}
|
||||
|
||||
if (static::$stemmerOK === true)
|
||||
{
|
||||
return static::$stemmer->stem($token, $lang);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Stem the token if we have a valid stemmer to use.
|
||||
if (static::$stemmer instanceof FinderIndexerStemmer)
|
||||
{
|
||||
static::$stemmerOK = true;
|
||||
|
||||
return static::$stemmer->stem($token, $lang);
|
||||
}
|
||||
}
|
||||
|
||||
return $token;
|
||||
return $language->stem($token);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -12,8 +12,8 @@ defined('_JEXEC') or die;
|
|||
use Joomla\String\StringHelper;
|
||||
|
||||
JLoader::register('FinderIndexerHelper', __DIR__ . '/helper.php');
|
||||
JLoader::register('FinderIndexerLanguage', __DIR__ . '/language.php');
|
||||
JLoader::register('FinderIndexerParser', __DIR__ . '/parser.php');
|
||||
JLoader::register('FinderIndexerStemmer', __DIR__ . '/stemmer.php');
|
||||
JLoader::register('FinderIndexerTaxonomy', __DIR__ . '/taxonomy.php');
|
||||
JLoader::register('FinderIndexerToken', __DIR__ . '/token.php');
|
||||
|
||||
|
@ -213,12 +213,6 @@ abstract class FinderIndexer
|
|||
static::$profiler = JProfiler::getInstance('FinderIndexer');
|
||||
}
|
||||
|
||||
// Setup the stemmer.
|
||||
if ($data->options->get('stem', 1) && $data->options->get('stemmer', 'porter_en'))
|
||||
{
|
||||
FinderIndexerHelper::$stemmer = FinderIndexerStemmer::getInstance($data->options->get('stemmer', 'porter_en'));
|
||||
}
|
||||
|
||||
// Set the state.
|
||||
static::$state = $data;
|
||||
|
||||
|
@ -471,6 +465,11 @@ abstract class FinderIndexer
|
|||
// Tokenize the input.
|
||||
$tokens = FinderIndexerHelper::tokenize($input, $lang);
|
||||
|
||||
if (count($tokens) == 0)
|
||||
{
|
||||
return $count;
|
||||
}
|
||||
|
||||
// Add the tokens to the database.
|
||||
$count += $this->addTokensToDb($tokens, $context);
|
||||
|
||||
|
|
146
administrator/components/com_finder/helpers/indexer/language.php
Normal file
146
administrator/components/com_finder/helpers/indexer/language.php
Normal file
|
@ -0,0 +1,146 @@
|
|||
<?php
|
||||
/**
|
||||
* @package Joomla.Administrator
|
||||
* @subpackage com_finder
|
||||
*
|
||||
* @copyright Copyright (C) 2005 - 2018 Open Source Matters, Inc. All rights reserved.
|
||||
* @license GNU General Public License version 2 or later; see LICENSE.txt
|
||||
*/
|
||||
|
||||
defined('_JEXEC') or die;
|
||||
|
||||
use Joomla\String\StringHelper;
|
||||
|
||||
/**
|
||||
* Language support class for the Finder indexer package.
|
||||
*
|
||||
* @since __DEPLOY_VERSION__
|
||||
*/
|
||||
class FinderIndexerLanguage
{
	/**
	 * Language support instances container, keyed by the language tag given to getInstance().
	 *
	 * This has to be static: getInstance() is a static factory, so a per-object
	 * property could never be reached from it and no instance would ever be reused.
	 *
	 * @var    FinderIndexerLanguage[]
	 * @since  __DEPLOY_VERSION__
	 */
	protected static $instances = array();

	/**
	 * Language locale of the class
	 *
	 * @var    string
	 * @since  __DEPLOY_VERSION__
	 */
	public $language;

	/**
	 * Spacer to use between terms
	 *
	 * @var    string
	 * @since  __DEPLOY_VERSION__
	 */
	public $spacer = ' ';

	/**
	 * Method to get a language support object.
	 *
	 * Instances are created once per requested language tag and then reused.
	 * For the wildcard tag '*' the generic handler is returned. For any other tag
	 * a class named 'FinderIndexerLanguage' . <primary language> is looked up in the
	 * language/ sub-directory; when no such class exists the generic handler is
	 * used and its $language property is set to the primary language.
	 *
	 * @param   string  $language  The language of the support object.
	 *
	 * @return  FinderIndexerLanguage  A FinderIndexerLanguage instance.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public static function getInstance($language)
	{
		if (isset(self::$instances[$language]))
		{
			return self::$instances[$language];
		}

		if ($language == '*')
		{
			// NOTE(review): the generic handler keeps $language unset (null) here - confirm that consumers of ->language accept that.
			self::$instances[$language] = new FinderIndexerLanguage;

			return self::$instances[$language];
		}

		$locale = FinderIndexerHelper::getPrimaryLanguage($language);
		$class  = 'FinderIndexerLanguage' . $locale;
		$path   = __DIR__ . '/language/' . $locale . '.php';

		// Only register the class with the autoloader when a matching support file is shipped.
		if (is_file($path))
		{
			JLoader::register($class, $path);
		}

		if (class_exists($class))
		{
			self::$instances[$language] = new $class;
		}
		else
		{
			// No dedicated support class: fall back to the generic tokeniser and the no-op stemmer.
			self::$instances[$language] = new FinderIndexerLanguage;
			self::$instances[$language]->language = $locale;
		}

		return self::$instances[$language];
	}

	/**
	 * Method to tokenise a text string.
	 *
	 * The input is lower-cased, normalised by a series of regular expressions and
	 * finally split on single spaces. For input that normalises to an empty string
	 * the result is an array holding one empty string, so callers that need only
	 * real terms have to filter the result.
	 *
	 * @param   string  $input  The input to tokenise.
	 *
	 * @return  array  An array of term strings.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function tokenise($input)
	{
		// Left/right single quotation marks and the ASCII apostrophe, written as entities to keep the literal valid PHP.
		$quotes = html_entity_decode('&#8216;&#8217;&#39;', ENT_QUOTES, 'UTF-8');

		/*
		 * Parsing the string input into terms is a multi-step process.
		 *
		 * Regexes:
		 *  1. Remove everything except letters, numbers, quotes, apostrophe, plus, dash, period, and comma.
		 *  2. Remove plus, dash, period, and comma characters located before letter characters.
		 *  3. Remove plus, dash, period, and comma characters located after other characters.
		 *  4. Remove plus, period, and comma characters enclosed in alphabetical characters. Ungreedy.
		 *  5. Remove orphaned apostrophe, plus, dash, period, and comma characters.
		 *  6. Remove orphaned quote characters.
		 *  7. Replace the assorted single quotation marks with the ASCII standard single quotation.
		 *  8. Remove multiple space characters and replaces with a single space.
		 */
		$input = StringHelper::strtolower($input);
		$input = preg_replace('#[^\pL\pM\pN\p{Pi}\p{Pf}\'+-.,]+#mui', ' ', $input);
		$input = preg_replace('#(^|\s)[+-.,]+([\pL\pM]+)#mui', ' $1', $input);
		$input = preg_replace('#([\pL\pM\pN]+)[+-.,]+(\s|$)#mui', '$1 ', $input);
		$input = preg_replace('#([\pL\pM]+)[+.,]+([\pL\pM]+)#muiU', '$1 $2', $input);
		$input = preg_replace('#(^|\s)[\'+-.,]+(\s|$)#mui', ' ', $input);
		$input = preg_replace('#(^|\s)[\p{Pi}\p{Pf}]+(\s|$)#mui', ' ', $input);
		$input = preg_replace('#[' . $quotes . ']+#mui', '\'', $input);
		$input = preg_replace('#\s+#mui', ' ', $input);
		$input = trim($input);

		// Explode the normalized string to get the terms.
		$terms = explode(' ', $input);

		return $terms;
	}

	/**
	 * Method to stem a token.
	 *
	 * The generic handler has no stemmer, so the token is returned unchanged.
	 * Language specific subclasses override this method.
	 *
	 * @param   string  $token  The token to stem.
	 *
	 * @return  string  The stemmed token.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function stem($token)
	{
		return $token;
	}
}
|
|
@ -0,0 +1,58 @@
|
|||
<?php
|
||||
/**
|
||||
* @package Joomla.Administrator
|
||||
* @subpackage com_finder
|
||||
*
|
||||
* @copyright Copyright (C) 2005 - 2018 Open Source Matters, Inc. All rights reserved.
|
||||
* @license GNU General Public License version 2 or later; see LICENSE.txt
|
||||
*/
|
||||
|
||||
defined('_JEXEC') or die;
|
||||
|
||||
/**
|
||||
* Danish language support class for the Finder indexer package.
|
||||
*
|
||||
* @since __DEPLOY_VERSION__
|
||||
*/
|
||||
class FinderIndexerLanguageda extends FinderIndexerLanguage
{
	/**
	 * Language code handled by this support class.
	 *
	 * @var    string
	 * @since  __DEPLOY_VERSION__
	 */
	public $language = 'da';

	/**
	 * Snowball stemmer for Danish; assigned in the constructor.
	 *
	 * @var    \Wamania\Snowball\Danish
	 * @since  __DEPLOY_VERSION__
	 */
	protected $stemmer = null;

	/**
	 * Creates the support object together with its Danish stemmer.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function __construct()
	{
		$this->stemmer = new \Wamania\Snowball\Danish();
	}

	/**
	 * Stems a token by delegating to the Danish Snowball stemmer.
	 *
	 * @param   string  $token  The token to stem.
	 *
	 * @return  string  The stemmed token.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function stem($token)
	{
		$stemmed = $this->stemmer->stem($token);

		return $stemmed;
	}
}
|
|
@ -0,0 +1,58 @@
|
|||
<?php
|
||||
/**
|
||||
* @package Joomla.Administrator
|
||||
* @subpackage com_finder
|
||||
*
|
||||
* @copyright Copyright (C) 2005 - 2018 Open Source Matters, Inc. All rights reserved.
|
||||
* @license GNU General Public License version 2 or later; see LICENSE.txt
|
||||
*/
|
||||
|
||||
defined('_JEXEC') or die;
|
||||
|
||||
/**
|
||||
* German language support class for the Finder indexer package.
|
||||
*
|
||||
* @since __DEPLOY_VERSION__
|
||||
*/
|
||||
class FinderIndexerLanguagede extends FinderIndexerLanguage
{
	/**
	 * Language code handled by this support class.
	 *
	 * @var    string
	 * @since  __DEPLOY_VERSION__
	 */
	public $language = 'de';

	/**
	 * Snowball stemmer for German; assigned in the constructor.
	 *
	 * @var    \Wamania\Snowball\German
	 * @since  __DEPLOY_VERSION__
	 */
	protected $stemmer = null;

	/**
	 * Creates the support object together with its German stemmer.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function __construct()
	{
		$this->stemmer = new \Wamania\Snowball\German();
	}

	/**
	 * Stems a token by delegating to the German Snowball stemmer.
	 *
	 * @param   string  $token  The token to stem.
	 *
	 * @return  string  The stemmed token.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function stem($token)
	{
		$stemmed = $this->stemmer->stem($token);

		return $stemmed;
	}
}
|
1015
administrator/components/com_finder/helpers/indexer/language/el.php
Normal file
1015
administrator/components/com_finder/helpers/indexer/language/el.php
Normal file
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,58 @@
|
|||
<?php
|
||||
/**
|
||||
* @package Joomla.Administrator
|
||||
* @subpackage com_finder
|
||||
*
|
||||
* @copyright Copyright (C) 2005 - 2018 Open Source Matters, Inc. All rights reserved.
|
||||
* @license GNU General Public License version 2 or later; see LICENSE.txt
|
||||
*/
|
||||
|
||||
defined('_JEXEC') or die;
|
||||
|
||||
/**
|
||||
* English language support class for the Finder indexer package.
|
||||
*
|
||||
* @since __DEPLOY_VERSION__
|
||||
*/
|
||||
class FinderIndexerLanguageen extends FinderIndexerLanguage
{
	/**
	 * Language code handled by this support class.
	 *
	 * @var    string
	 * @since  __DEPLOY_VERSION__
	 */
	public $language = 'en';

	/**
	 * Snowball stemmer for English; assigned in the constructor.
	 *
	 * @var    \Wamania\Snowball\English
	 * @since  __DEPLOY_VERSION__
	 */
	protected $stemmer = null;

	/**
	 * Creates the support object together with its English stemmer.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function __construct()
	{
		$this->stemmer = new \Wamania\Snowball\English();
	}

	/**
	 * Stems a token by delegating to the English Snowball stemmer.
	 *
	 * @param   string  $token  The token to stem.
	 *
	 * @return  string  The stemmed token.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function stem($token)
	{
		$stemmed = $this->stemmer->stem($token);

		return $stemmed;
	}
}
|
|
@ -0,0 +1,58 @@
|
|||
<?php
|
||||
/**
|
||||
* @package Joomla.Administrator
|
||||
* @subpackage com_finder
|
||||
*
|
||||
* @copyright Copyright (C) 2005 - 2018 Open Source Matters, Inc. All rights reserved.
|
||||
* @license GNU General Public License version 2 or later; see LICENSE.txt
|
||||
*/
|
||||
|
||||
defined('_JEXEC') or die;
|
||||
|
||||
/**
|
||||
* Spanish language support class for the Finder indexer package.
|
||||
*
|
||||
* @since __DEPLOY_VERSION__
|
||||
*/
|
||||
class FinderIndexerLanguagees extends FinderIndexerLanguage
{
	/**
	 * Language code handled by this support class.
	 *
	 * @var    string
	 * @since  __DEPLOY_VERSION__
	 */
	public $language = 'es';

	/**
	 * Snowball stemmer for Spanish; assigned in the constructor.
	 *
	 * @var    \Wamania\Snowball\Spanish
	 * @since  __DEPLOY_VERSION__
	 */
	protected $stemmer = null;

	/**
	 * Creates the support object together with its Spanish stemmer.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function __construct()
	{
		$this->stemmer = new \Wamania\Snowball\Spanish();
	}

	/**
	 * Stems a token by delegating to the Spanish Snowball stemmer.
	 *
	 * @param   string  $token  The token to stem.
	 *
	 * @return  string  The stemmed token.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function stem($token)
	{
		$stemmed = $this->stemmer->stem($token);

		return $stemmed;
	}
}
|
|
@ -0,0 +1,58 @@
|
|||
<?php
|
||||
/**
|
||||
* @package Joomla.Administrator
|
||||
* @subpackage com_finder
|
||||
*
|
||||
* @copyright Copyright (C) 2005 - 2018 Open Source Matters, Inc. All rights reserved.
|
||||
* @license GNU General Public License version 2 or later; see LICENSE.txt
|
||||
*/
|
||||
|
||||
defined('_JEXEC') or die;
|
||||
|
||||
/**
|
||||
* French language support class for the Finder indexer package.
|
||||
*
|
||||
* @since __DEPLOY_VERSION__
|
||||
*/
|
||||
class FinderIndexerLanguagefr extends FinderIndexerLanguage
{
	/**
	 * Language code handled by this support class.
	 *
	 * @var    string
	 * @since  __DEPLOY_VERSION__
	 */
	public $language = 'fr';

	/**
	 * Snowball stemmer for French; assigned in the constructor.
	 *
	 * @var    \Wamania\Snowball\French
	 * @since  __DEPLOY_VERSION__
	 */
	protected $stemmer = null;

	/**
	 * Creates the support object together with its French stemmer.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function __construct()
	{
		$this->stemmer = new \Wamania\Snowball\French();
	}

	/**
	 * Stems a token by delegating to the French Snowball stemmer.
	 *
	 * @param   string  $token  The token to stem.
	 *
	 * @return  string  The stemmed token.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function stem($token)
	{
		$stemmed = $this->stemmer->stem($token);

		return $stemmed;
	}
}
|
|
@ -0,0 +1,58 @@
|
|||
<?php
|
||||
/**
|
||||
* @package Joomla.Administrator
|
||||
* @subpackage com_finder
|
||||
*
|
||||
* @copyright Copyright (C) 2005 - 2018 Open Source Matters, Inc. All rights reserved.
|
||||
* @license GNU General Public License version 2 or later; see LICENSE.txt
|
||||
*/
|
||||
|
||||
defined('_JEXEC') or die;
|
||||
|
||||
/**
|
||||
* Italian language support class for the Finder indexer package.
|
||||
*
|
||||
* @since __DEPLOY_VERSION__
|
||||
*/
|
||||
class FinderIndexerLanguageit extends FinderIndexerLanguage
{
	/**
	 * Language code handled by this support class.
	 *
	 * @var    string
	 * @since  __DEPLOY_VERSION__
	 */
	public $language = 'it';

	/**
	 * Snowball stemmer for Italian; assigned in the constructor.
	 *
	 * @var    \Wamania\Snowball\Italian
	 * @since  __DEPLOY_VERSION__
	 */
	protected $stemmer = null;

	/**
	 * Creates the support object together with its Italian stemmer.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function __construct()
	{
		$this->stemmer = new \Wamania\Snowball\Italian();
	}

	/**
	 * Stems a token by delegating to the Italian Snowball stemmer.
	 *
	 * @param   string  $token  The token to stem.
	 *
	 * @return  string  The stemmed token.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function stem($token)
	{
		$stemmed = $this->stemmer->stem($token);

		return $stemmed;
	}
}
|
|
@ -0,0 +1,58 @@
|
|||
<?php
|
||||
/**
|
||||
* @package Joomla.Administrator
|
||||
* @subpackage com_finder
|
||||
*
|
||||
* @copyright Copyright (C) 2005 - 2018 Open Source Matters, Inc. All rights reserved.
|
||||
* @license GNU General Public License version 2 or later; see LICENSE.txt
|
||||
*/
|
||||
|
||||
defined('_JEXEC') or die;
|
||||
|
||||
/**
|
||||
* Dutch language support class for the Finder indexer package.
|
||||
*
|
||||
* @since __DEPLOY_VERSION__
|
||||
*/
|
||||
class FinderIndexerLanguagenl extends FinderIndexerLanguage
{
	/**
	 * Language code handled by this support class.
	 *
	 * @var    string
	 * @since  __DEPLOY_VERSION__
	 */
	public $language = 'nl';

	/**
	 * Snowball stemmer for Dutch; assigned in the constructor.
	 *
	 * @var    \Wamania\Snowball\Dutch
	 * @since  __DEPLOY_VERSION__
	 */
	protected $stemmer = null;

	/**
	 * Creates the support object together with its Dutch stemmer.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function __construct()
	{
		$this->stemmer = new \Wamania\Snowball\Dutch();
	}

	/**
	 * Stems a token by delegating to the Dutch Snowball stemmer.
	 *
	 * @param   string  $token  The token to stem.
	 *
	 * @return  string  The stemmed token.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function stem($token)
	{
		$stemmed = $this->stemmer->stem($token);

		return $stemmed;
	}
}
|
|
@ -0,0 +1,58 @@
|
|||
<?php
|
||||
/**
|
||||
* @package Joomla.Administrator
|
||||
* @subpackage com_finder
|
||||
*
|
||||
* @copyright Copyright (C) 2005 - 2018 Open Source Matters, Inc. All rights reserved.
|
||||
* @license GNU General Public License version 2 or later; see LICENSE.txt
|
||||
*/
|
||||
|
||||
defined('_JEXEC') or die;
|
||||
|
||||
/**
|
||||
* Norwegian language support class for the Finder indexer package.
|
||||
*
|
||||
* @since __DEPLOY_VERSION__
|
||||
*/
|
||||
class FinderIndexerLanguagenn extends FinderIndexerLanguage
{
	/**
	 * Language code handled by this support class.
	 *
	 * @var    string
	 * @since  __DEPLOY_VERSION__
	 */
	public $language = 'nn';

	/**
	 * Snowball stemmer for Norwegian; assigned in the constructor.
	 *
	 * @var    \Wamania\Snowball\Norwegian
	 * @since  __DEPLOY_VERSION__
	 */
	protected $stemmer = null;

	/**
	 * Creates the support object together with its Norwegian stemmer.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function __construct()
	{
		$this->stemmer = new \Wamania\Snowball\Norwegian();
	}

	/**
	 * Stems a token by delegating to the Norwegian Snowball stemmer.
	 *
	 * @param   string  $token  The token to stem.
	 *
	 * @return  string  The stemmed token.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function stem($token)
	{
		$stemmed = $this->stemmer->stem($token);

		return $stemmed;
	}
}
|
|
@ -0,0 +1,58 @@
|
|||
<?php
|
||||
/**
|
||||
* @package Joomla.Administrator
|
||||
* @subpackage com_finder
|
||||
*
|
||||
* @copyright Copyright (C) 2005 - 2018 Open Source Matters, Inc. All rights reserved.
|
||||
* @license GNU General Public License version 2 or later; see LICENSE.txt
|
||||
*/
|
||||
|
||||
defined('_JEXEC') or die;
|
||||
|
||||
/**
|
||||
* Portuguese language support class for the Finder indexer package.
|
||||
*
|
||||
* @since __DEPLOY_VERSION__
|
||||
*/
|
||||
class FinderIndexerLanguagept extends FinderIndexerLanguage
{
	/**
	 * Language code handled by this support class.
	 *
	 * @var    string
	 * @since  __DEPLOY_VERSION__
	 */
	public $language = 'pt';

	/**
	 * Snowball stemmer for Portuguese; assigned in the constructor.
	 *
	 * @var    \Wamania\Snowball\Portuguese
	 * @since  __DEPLOY_VERSION__
	 */
	protected $stemmer = null;

	/**
	 * Creates the support object together with its Portuguese stemmer.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function __construct()
	{
		$this->stemmer = new \Wamania\Snowball\Portuguese();
	}

	/**
	 * Stems a token by delegating to the Portuguese Snowball stemmer.
	 *
	 * @param   string  $token  The token to stem.
	 *
	 * @return  string  The stemmed token.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function stem($token)
	{
		$stemmed = $this->stemmer->stem($token);

		return $stemmed;
	}
}
|
|
@ -0,0 +1,58 @@
|
|||
<?php
|
||||
/**
|
||||
* @package Joomla.Administrator
|
||||
* @subpackage com_finder
|
||||
*
|
||||
* @copyright Copyright (C) 2005 - 2018 Open Source Matters, Inc. All rights reserved.
|
||||
* @license GNU General Public License version 2 or later; see LICENSE.txt
|
||||
*/
|
||||
|
||||
defined('_JEXEC') or die;
|
||||
|
||||
/**
|
||||
* Romanian language support class for the Finder indexer package.
|
||||
*
|
||||
* @since __DEPLOY_VERSION__
|
||||
*/
|
||||
class FinderIndexerLanguagero extends FinderIndexerLanguage
{
	/**
	 * Language code handled by this support class.
	 *
	 * @var    string
	 * @since  __DEPLOY_VERSION__
	 */
	public $language = 'ro';

	/**
	 * Snowball stemmer for Romanian; assigned in the constructor.
	 *
	 * @var    \Wamania\Snowball\Romanian
	 * @since  __DEPLOY_VERSION__
	 */
	protected $stemmer = null;

	/**
	 * Creates the support object together with its Romanian stemmer.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function __construct()
	{
		$this->stemmer = new \Wamania\Snowball\Romanian();
	}

	/**
	 * Stems a token by delegating to the Romanian Snowball stemmer.
	 *
	 * @param   string  $token  The token to stem.
	 *
	 * @return  string  The stemmed token.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function stem($token)
	{
		$stemmed = $this->stemmer->stem($token);

		return $stemmed;
	}
}
|
|
@ -0,0 +1,58 @@
|
|||
<?php
|
||||
/**
|
||||
* @package Joomla.Administrator
|
||||
* @subpackage com_finder
|
||||
*
|
||||
* @copyright Copyright (C) 2005 - 2018 Open Source Matters, Inc. All rights reserved.
|
||||
* @license GNU General Public License version 2 or later; see LICENSE.txt
|
||||
*/
|
||||
|
||||
defined('_JEXEC') or die;
|
||||
|
||||
/**
|
||||
* Russian language support class for the Finder indexer package.
|
||||
*
|
||||
* @since __DEPLOY_VERSION__
|
||||
*/
|
||||
class FinderIndexerLanguageru extends FinderIndexerLanguage
{
	/**
	 * Language code handled by this support class.
	 *
	 * @var    string
	 * @since  __DEPLOY_VERSION__
	 */
	public $language = 'ru';

	/**
	 * Snowball stemmer for Russian; assigned in the constructor.
	 *
	 * @var    \Wamania\Snowball\Russian
	 * @since  __DEPLOY_VERSION__
	 */
	protected $stemmer = null;

	/**
	 * Creates the support object together with its Russian stemmer.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function __construct()
	{
		$this->stemmer = new \Wamania\Snowball\Russian();
	}

	/**
	 * Stems a token by delegating to the Russian Snowball stemmer.
	 *
	 * @param   string  $token  The token to stem.
	 *
	 * @return  string  The stemmed token.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function stem($token)
	{
		$stemmed = $this->stemmer->stem($token);

		return $stemmed;
	}
}
|
|
@ -0,0 +1,58 @@
|
|||
<?php
|
||||
/**
|
||||
* @package Joomla.Administrator
|
||||
* @subpackage com_finder
|
||||
*
|
||||
* @copyright Copyright (C) 2005 - 2018 Open Source Matters, Inc. All rights reserved.
|
||||
* @license GNU General Public License version 2 or later; see LICENSE.txt
|
||||
*/
|
||||
|
||||
defined('_JEXEC') or die;
|
||||
|
||||
/**
 * Swedish language support class for the Finder indexer package.
 *
 * Stemming is delegated to the Snowball Swedish stemmer from the
 * wamania/php-stemmer library.
 *
 * @since  __DEPLOY_VERSION__
 */
class FinderIndexerLanguagesv extends FinderIndexerLanguage
{
	/**
	 * Language locale handled by this class.
	 *
	 * @var    string
	 * @since  __DEPLOY_VERSION__
	 */
	public $language = 'sv';

	/**
	 * The Swedish stemmer object; assigned in the constructor.
	 *
	 * @var    \Wamania\Snowball\Swedish
	 * @since  __DEPLOY_VERSION__
	 */
	protected $stemmer;

	/**
	 * Creates the language object and its Snowball stemmer.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function __construct()
	{
		$this->stemmer = new \Wamania\Snowball\Swedish();
	}

	/**
	 * Reduces a token to its stem using the Swedish Snowball stemmer.
	 *
	 * @param   string  $token  The token to stem.
	 *
	 * @return  string  The stemmed token.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function stem($token)
	{
		$stemmed = $this->stemmer->stem($token);

		return $stemmed;
	}
}
|
|
@ -0,0 +1,74 @@
|
|||
<?php
|
||||
/**
|
||||
* @package Joomla.Administrator
|
||||
* @subpackage com_finder
|
||||
*
|
||||
* @copyright Copyright (C) 2005 - 2018 Open Source Matters, Inc. All rights reserved.
|
||||
* @license GNU General Public License version 2 or later; see LICENSE.txt
|
||||
*/
|
||||
|
||||
defined('_JEXEC') or die;
|
||||
|
||||
use Joomla\String\StringHelper;
|
||||
|
||||
/**
 * Chinese (simplified) language support class for the Finder indexer package.
 *
 * Extends the generic tokeniser so that every Han character becomes a term of its own.
 *
 * @since  __DEPLOY_VERSION__
 */
class FinderIndexerLanguagezh extends FinderIndexerLanguage
{
	/**
	 * Language locale of the class
	 *
	 * @var    string
	 * @since  __DEPLOY_VERSION__
	 */
	public $language = 'zh';

	/**
	 * Spacer between terms
	 *
	 * @var    string
	 * @since  __DEPLOY_VERSION__
	 */
	public $spacer = '';

	/**
	 * Method to tokenise a text string.
	 *
	 * The input is first tokenised by the parent class. Each resulting term is then
	 * scanned for Han characters: those characters are removed from the term and
	 * appended to the result as individual terms. A term that is empty after the
	 * removal is dropped, so the returned array may have gaps in its keys.
	 *
	 * @param   string  $input  The input to tokenise.
	 *
	 * @return  array  An array of term strings.
	 *
	 * @since   __DEPLOY_VERSION__
	 */
	public function tokenise($input)
	{
		$terms = parent::tokenise($input);

		// foreach walks a snapshot of $terms, so the characters appended below are not re-scanned.
		foreach ($terms as $i => $term)
		{
			$charMatches = array();
			$charCount   = preg_match_all('#[\p{Han}]#mui', $term, $charMatches);

			// No Han characters (or a PCRE failure): leave the term untouched.
			if (!$charCount)
			{
				continue;
			}

			/*
			 * Strip the Han characters on a local copy. The term must not be read back from
			 * $terms[$i] inside this loop, because that slot may already have been removed.
			 */
			$remainder = $term;

			foreach ($charMatches[0] as $char)
			{
				$remainder = StringHelper::str_ireplace($char, '', $remainder, false);
			}

			if (!empty($remainder))
			{
				$terms[$i] = $remainder;
			}
			else
			{
				unset($terms[$i]);
			}

			// Every Han character becomes a term of its own, in order of appearance.
			foreach ($charMatches[0] as $char)
			{
				$terms[] = $char;
			}
		}

		return $terms;
	}
}
|
|
@ -106,6 +106,14 @@ class FinderIndexerQuery
|
|||
*/
|
||||
public $terms;
|
||||
|
||||
/**
|
||||
* Allow empty searches
|
||||
*
|
||||
* @var boolean
|
||||
* @since __DEPLOY_VERSION__
|
||||
*/
|
||||
public $empty;
|
||||
|
||||
/**
|
||||
* The static filter id.
|
||||
*
|
||||
|
@ -180,7 +188,6 @@ class FinderIndexerQuery
|
|||
|
||||
// Get the input language.
|
||||
$this->language = !empty($options['language']) ? $options['language'] : FinderIndexerHelper::getDefaultLanguage();
|
||||
$this->language = FinderIndexerHelper::getPrimaryLanguage($this->language);
|
||||
|
||||
// Get the matching mode.
|
||||
$this->mode = 'AND';
|
||||
|
@ -995,7 +1002,7 @@ class FinderIndexerQuery
|
|||
{
|
||||
// Tokenize the current term.
|
||||
$token = FinderIndexerHelper::tokenize($terms[$i], $lang, true);
|
||||
$token = $this->getTokenData($token);
|
||||
$token = $this->getTokenData(array_shift($token));
|
||||
|
||||
// Set the required flag.
|
||||
$token->required = true;
|
||||
|
@ -1009,7 +1016,7 @@ class FinderIndexerQuery
|
|||
|
||||
// Tokenize the term after the next term (current plus two).
|
||||
$other = FinderIndexerHelper::tokenize($terms[$i + 2], $lang, true);
|
||||
$other = $this->getTokenData($other);
|
||||
$other = $this->getTokenData(array_shift($other));
|
||||
|
||||
// Set the required flag.
|
||||
$other->required = true;
|
||||
|
@ -1147,7 +1154,7 @@ class FinderIndexerQuery
|
|||
|
||||
// Tokenize the next term (current plus one).
|
||||
$other = FinderIndexerHelper::tokenize($terms[$i + 1], $lang, true);
|
||||
$other = $this->getTokenData($other);
|
||||
$other = $this->getTokenData(array_shift($other));
|
||||
|
||||
// Set the required flag.
|
||||
$other->required = false;
|
||||
|
@ -1187,7 +1194,7 @@ class FinderIndexerQuery
|
|||
{
|
||||
// Tokenize the phrase.
|
||||
$token = FinderIndexerHelper::tokenize($phrases[$i], $lang, true);
|
||||
$token = $this->getTokenData($token);
|
||||
$token = $this->getTokenData(array_shift($token));
|
||||
|
||||
// Set the required flag.
|
||||
$token->required = true;
|
||||
|
|
|
@ -1,83 +0,0 @@
|
|||
<?php
|
||||
/**
|
||||
* @package Joomla.Administrator
|
||||
* @subpackage com_finder
|
||||
*
|
||||
* @copyright Copyright (C) 2005 - 2018 Open Source Matters, Inc. All rights reserved.
|
||||
* @license GNU General Public License version 2 or later; see LICENSE.txt
|
||||
*/
|
||||
|
||||
defined('_JEXEC') or die;
|
||||
|
||||
/**
 * Stemmer base class for the Finder indexer package.
 *
 * Concrete stemmers live in the "stemmer" directory next to this file and are
 * obtained through getInstance().
 *
 * @since  2.5
 */
abstract class FinderIndexerStemmer
{
	/**
	 * An internal cache of stemmed tokens.
	 *
	 * @var    array
	 * @since  2.5
	 */
	public $cache = array();

	/**
	 * Method to get a stemmer, creating it if necessary.
	 *
	 * The adapter name is sanitised with the 'cmd' input filter before it is used
	 * as cache key, file name and class-name suffix, so that repeated calls with
	 * the same name always resolve to the same cached instance.
	 *
	 * @param   string  $adapter  The type of stemmer to load.
	 *
	 * @return  FinderIndexerStemmer  A FinderIndexerStemmer instance.
	 *
	 * @since   2.5
	 * @throws  Exception on invalid stemmer.
	 */
	public static function getInstance($adapter)
	{
		// Only one stemmer is created per (sanitised) adapter name.
		static $instances = array();

		// Sanitise before the lookup: the instance is stored under the cleaned name below.
		$adapter = JFilterInput::getInstance()->clean($adapter, 'cmd');

		if (isset($instances[$adapter]))
		{
			return $instances[$adapter];
		}

		// Setup the adapter for the stemmer.
		$path  = __DIR__ . '/stemmer/' . $adapter . '.php';
		$class = 'FinderIndexerStemmer' . ucfirst($adapter);

		// Check if a stemmer exists for the adapter.
		if (!file_exists($path))
		{
			// Throw invalid adapter exception.
			throw new Exception(JText::sprintf('COM_FINDER_INDEXER_INVALID_STEMMER', $adapter));
		}

		// Instantiate the stemmer.
		JLoader::register($class, $path);
		$instances[$adapter] = new $class;

		return $instances[$adapter];
	}

	/**
	 * Method to stem a token and return the root.
	 *
	 * @param   string  $token  The token to stem.
	 * @param   string  $lang   The language of the token.
	 *
	 * @return  string  The root token.
	 *
	 * @since   2.5
	 */
	abstract public function stem($token, $lang);
}
|
|
@ -1,265 +0,0 @@
|
|||
<?php
|
||||
/**
|
||||
* @package Joomla.Administrator
|
||||
* @subpackage com_finder
|
||||
*
|
||||
* @copyright Copyright (C) 2005 - 2018 Open Source Matters, Inc. All rights reserved.
|
||||
* @license GNU General Public License version 2 or later; see LICENSE.txt
|
||||
*/
|
||||
|
||||
defined('_JEXEC') or die;
|
||||
|
||||
JLoader::register('FinderIndexerStemmer', dirname(__DIR__) . '/stemmer.php');
|
||||
|
||||
/**
|
||||
* French stemmer class for Smart Search indexer.
|
||||
*
|
||||
* First contributed by Eric Sanou (bobotche@hotmail.fr)
|
||||
* This class is inspired by Alexis Ulrich's French stemmer code (http://alx2002.free.fr)
|
||||
*
|
||||
* @since 3.0
|
||||
*/
|
||||
class FinderIndexerStemmerFr extends FinderIndexerStemmer
|
||||
{
|
||||
/**
|
||||
* Stemming rules.
|
||||
*
|
||||
* @var array
|
||||
* @since 3.0
|
||||
*/
|
||||
private static $stemRules = null;
|
||||
|
||||
/**
 * Method to stem a token and return the root.
 *
 * Tokens of two bytes or fewer, and tokens whose language is neither 'fr'
 * nor '*', are returned unchanged. Results are memoised per language in
 * the instance cache.
 *
 * @param   string  $token  The token to stem.
 * @param   string  $lang   The language of the token.
 *
 * @return  string  The root token.
 *
 * @since   3.0
 */
public function stem($token, $lang)
{
	$longEnough = strlen($token) > 2;
	$isFrench   = $lang === 'fr' || $lang == '*';

	// Nothing to do for very short tokens or for other languages.
	if (!$longEnough || !$isFrench)
	{
		return $token;
	}

	// Serve from the cache when this token was stemmed before.
	if (isset($this->cache[$lang][$token]))
	{
		return $this->cache[$lang][$token];
	}

	// Stem the token and remember the result.
	$this->cache[$lang][$token] = self::getStem($token);

	return $this->cache[$lang][$token];
}
|
||||
|
||||
/**
 * French stemmer rules variables.
 *
 * The rule set is built once and kept in the static $stemRules property.
 * The returned array has the keys 'accents', 'rule_pattern', 'vowels' and 'rules'.
 *
 * @return  array  The rules
 *
 * @since   3.0
 */
protected static function getStemRules()
{
	if (!self::$stemRules)
	{
		// French accented letters in ISO-8859-1 encoding
		$accents = chr(224) . chr(226) . chr(232) . chr(233) . chr(234) . chr(235) . chr(238) . chr(239)
			. chr(244) . chr(251) . chr(249) . chr(231);

		// French vowels (including y) in ISO-8859-1 encoding
		$vowels = chr(97) . chr(224) . chr(226) . chr(101) . chr(232) . chr(233) . chr(234) . chr(235)
			. chr(105) . chr(238) . chr(239) . chr(111) . chr(244) . chr(117) . chr(251) . chr(249) . chr(121);

		self::$stemRules = array(
			'accents'      => $accents,

			// The rule patterns include all accented words for french language
			'rule_pattern' => '/^([a-z' . $accents . ']*)(\*){0,1}(\d)([a-z' . $accents . ']*)([.|>])/',

			'vowels'       => $vowels,

			// The French rules in ISO-8859-1 encoding
			'rules'        => array(
				'esre1>', 'esio1>', 'siol1.', 'siof0.', 'sioe0.', 'sio3>', 'st1>', 'sf1>', 'sle1>', 'slo1>', 's' . chr(233) . '1>', chr(233) . 'tuae5.',
				chr(233) . 'tuae2.', 'tnia0.', 'tniv1.', 'tni3>', 'suor1.', 'suo0.', 'sdrail5.', 'sdrai4.', 'er' . chr(232) . 'i1>', 'sesue3x>',
				'esuey5i.', 'esue2x>', 'se1>', 'er' . chr(232) . 'g3.', 'eca1>', 'esiah0.', 'esi1>', 'siss2.', 'sir2>', 'sit2>', 'egan' . chr(233) . '1.',
				'egalli6>', 'egass1.', 'egas0.', 'egat3.', 'ega3>', 'ette4>', 'ett2>', 'etio1.', 'tio' . chr(231) . '4c.', 'tio0.', 'et1>', 'eb1>',
				'snia1>', 'eniatnau8>', 'eniatn4.', 'enia1>', 'niatnio3.', 'niatg3.', 'e' . chr(233) . '1>', chr(233) . 'hcat1.', chr(233) . 'hca4.',
				chr(233) . 'tila5>', chr(233) . 'tici5.', chr(233) . 'tir1.', chr(233) . 'ti3>', chr(233) . 'gan1.', chr(233) . 'ga3>',
				chr(233) . 'tehc1.', chr(233) . 'te3>', chr(233) . 'it0.', chr(233) . '1>', 'eire4.', 'eirue5.', 'eio1.', 'eia1.', 'ei1>', 'eng1.',
				'xuaessi7.', 'xuae1>', 'uaes0.', 'uae3.', 'xuave2l.', 'xuav2li>', 'xua3la>', 'ela1>', 'lart2.', 'lani2>', 'la' . chr(233) . '2>',
				'siay4i.', 'siassia7.', 'siarv1*.', 'sia1>', 'tneiayo6i.', 'tneiay6i.', 'tneiassia9.', 'tneiareio7.', 'tneia5>', 'tneia4>', 'tiario4.',
				'tiarim3.', 'tiaria3.', 'tiaris3.', 'tiari5.', 'tiarve6>', 'tiare5>', 'iare4>', 'are3>', 'tiay4i.', 'tia3>', 'tnay4i.',
				'em' . chr(232) . 'iu5>', 'em' . chr(232) . 'i4>', 'tnaun3.', 'tnauqo3.', 'tnau4>', 'tnaf0.', 'tnat' . chr(233) . '2>', 'tna3>', 'tno3>',
				'zeiy4i.', 'zey3i.', 'zeire5>', 'zeird4.', 'zeirio4.', 'ze2>', 'ssiab0.', 'ssia4.', 'ssi3.', 'tnemma6>', 'tnemesuey9i.', 'tnemesue8>',
				'tnemevi7.', 'tnemessia5.', 'tnemessi8.', 'tneme5>', 'tnemia4.', 'tnem' . chr(233) . '5>', 'el2l>', 'lle3le>', 'let' . chr(244) . '0.',
				'lepp0.', 'le2>', 'srei1>', 'reit3.', 'reila2.', 'rei3>', 'ert' . chr(226) . 'e5.', 'ert' . chr(226) . chr(233) . '1.',
				'ert' . chr(226) . '4.', 'drai4.', 'erdro0.', 'erute5.', 'ruta0.', 'eruta1.', 'erutiov1.', 'erub3.', 'eruh3.', 'erul3.', 'er2r>', 'nn1>',
				'r' . chr(232) . 'i3.', 'srev0.', 'sr1>', 'rid2>', 're2>', 'xuei4.', 'esuei5.', 'lbati3.', 'lba3>', 'rueis0.', 'ruehcn4.', 'ecirta6.',
				'ruetai6.', 'rueta5.', 'rueir0.', 'rue3>', 'esseti6.', 'essere6>', 'esserd1.', 'esse4>', 'essiab1.', 'essia5.', 'essio1.', 'essi4.',
				'essal4.', 'essa1>', 'ssab1.', 'essurp1.', 'essu4.', 'essi1.', 'ssor1.', 'essor2.', 'esso1>', 'ess2>', 'tio3.', 'r' . chr(232) . 's2re.',
				'r' . chr(232) . '0e.', 'esn1.', 'eu1>', 'sua0.', 'su1>', 'utt1>', 'tu' . chr(231) . '3c.', 'u' . chr(231) . '2c.', 'ur1.', 'ehcn2>',
				'ehcu1>', 'snorr3.', 'snoru3.', 'snorua3.', 'snorv3.', 'snorio4.', 'snori5.', 'snore5>', 'snortt4>', 'snort' . chr(238) . 'a7.', 'snort3.',
				'snor4.', 'snossi6.', 'snoire6.', 'snoird5.', 'snoitai7.', 'snoita6.', 'snoits1>', 'noits0.', 'snoi4>', 'noitaci7>', 'noitai6.', 'noita5.',
				'noitu4.', 'noi3>', 'snoya0.', 'snoy4i.', 'sno' . chr(231) . 'a1.', 'sno' . chr(231) . 'r1.', 'snoe4.', 'snosiar1>', 'snola1.', 'sno3>',
				'sno1>', 'noll2.', 'tnennei4.', 'ennei2>', 'snei1>', 'sne' . chr(233) . '1>', 'enne' . chr(233) . '5e.', 'ne' . chr(233) . '3e.', 'neic0.',
				'neiv0.', 'nei3.', 'sc1.', 'sd1.', 'sg1.', 'sni1.', 'tiu0.', 'ti2.', 'sp1>', 'sna1>', 'sue1.', 'enn2>', 'nong2.', 'noss2.', 'rioe4.',
				'riot0.', 'riorc1.', 'riovec5.', 'rio3.', 'ric2.', 'ril2.', 'tnerim3.', 'tneris3>', 'tneri5.', 't' . chr(238) . 'a3.', 'riss2.',
				't' . chr(238) . '2.', 't' . chr(226) . '2>', 'ario2.', 'arim1.', 'ara1.', 'aris1.', 'ari3.', 'art1>', 'ardn2.', 'arr1.', 'arua1.',
				'aro1.', 'arv1.', 'aru1.', 'ar2.', 'rd1.', 'ud1.', 'ul1.', 'ini1.', 'rin2.', 'tnessiab3.', 'tnessia7.', 'tnessi6.', 'tnessni4.', 'sini2.',
				'sl1.', 'iard3.', 'iario3.', 'ia2>', 'io0.', 'iule2.', 'i1>', 'sid2.', 'sic2.', 'esoi4.', 'ed1.', 'ai2>', 'a1>', 'adr1.',
				'tner' . chr(232) . '5>', 'evir1.', 'evio4>', 'evi3.', 'fita4.', 'fi2>', 'enie1.', 'sare4>', 'sari4>', 'sard3.', 'sart2>', 'sa2.',
				'tnessa6>', 'tnessu6>', 'tnegna3.', 'tnegi3.', 'tneg0.', 'tneru5>', 'tnemg0.', 'tnerni4.', 'tneiv1.', 'tne3>', 'une1.', 'en1>', 'nitn2.',
				'ecnay5i.', 'ecnal1.', 'ecna4.', 'ec1>', 'nn1.', 'rit2>', 'rut2>', 'rud2.', 'ugn1>', 'eg1>', 'tuo0.', 'tul2>', 't' . chr(251) . '2>',
				'ev1>', 'v' . chr(232) . '2ve>', 'rtt1>', 'emissi6.', 'em1.', 'ehc1.', 'c' . chr(233) . 'i2c' . chr(232) . '.', 'libi2l.', 'llie1.',
				'liei4i.', 'xuev1.', 'xuey4i.', 'xueni5>', 'xuell4.', 'xuere5.', 'xue3>', 'rb' . chr(233) . '3rb' . chr(232) . '.', 'tur2.',
				'rir' . chr(233) . '4re.', 'rir2.', 'c' . chr(226) . '2ca.', 'snu1.', 'rt' . chr(238) . 'a4.', 'long2.', 'vec2.', chr(231) . '1c>',
				'ssilp3.', 'silp2.', 't' . chr(232) . 'hc2te.', 'n' . chr(232) . 'm2ne.', 'llepp1.', 'tan2.', 'rv' . chr(232) . '3rve.',
				'rv' . chr(233) . '3rve.', 'r' . chr(232) . '2re.', 'r' . chr(233) . '2re.', 't' . chr(232) . '2te.', 't' . chr(233) . '2te.', 'epp1.',
				'eya2i.', 'ya1i.', 'yo1i.', 'esu1.', 'ugi1.', 'tt1.', 'end0.'
			),
		);
	}

	return self::$stemRules;
}
|
||||
|
||||
/**
 * Finds the first applicable rule at or after a given position in the rule list.
 *
 * A rule applies when its leading letters (the first capture group of the
 * rule pattern) match the start of the reversed input, ignoring case.
 * Returns -1 if no rule can be applied, ie the stem has been found.
 *
 * @param   string   $reversed_input  The input to check in reversed order
 * @param   integer  $rule_number     The rule number to check
 *
 * @return  integer  Number of the first rule
 *
 * @since   3.0
 */
private static function getFirstRule($reversed_input, $rule_number)
{
	$stemRules = static::getStemRules();
	$ruleList  = $stemRules['rules'];
	$total     = count($ruleList);
	$index     = $rule_number;

	while ($index < $total)
	{
		// Keep only the letters that open the current rule.
		$letters = utf8_decode(preg_replace($stemRules['rule_pattern'], "\\1", $ruleList[$index]));

		if (strncasecmp($letters, $reversed_input, strlen($letters)) == 0)
		{
			return $index;
		}

		$index++;
	}

	return -1;
}
|
||||
|
||||
/**
 * Check the acceptability of a stem for French language
 *
 * The stem is passed reversed, so a vowel at the end of $reversed_stem means
 * the stem itself starts with a vowel. In both cases the stem must be longer
 * than two bytes; a stem starting with a consonant must additionally contain
 * a vowel (or "y"). In that last case the raw preg_match() result is returned.
 *
 * @param   string  $reversed_stem  The stem to check in reverse form
 *
 * @return  boolean  True if stem is acceptable
 *
 * @since   3.0
 */
private static function check($reversed_stem)
{
	$stemRules  = static::getStemRules();
	$vowelClass = '[' . $stemRules['vowels'] . ']';
	$encoded    = utf8_encode($reversed_stem);
	$length     = strlen($reversed_stem);

	// Stem starting with a vowel: more than two letters must remain (e.g.: "etaient" --> "et")
	if (preg_match('/' . $vowelClass . '$/', $encoded))
	{
		return ($length > 2);
	}

	// Stem starting with a consonant: more than two letters must remain as well...
	if ($length <= 2)
	{
		return false;
	}

	// ...and at least one of them must be a vowel or "y"
	return preg_match('/' . $vowelClass . '/', $encoded);
}
|
||||
|
||||
/**
 * Paice/Husk stemmer which returns a stem for the given $input
 *
 * The input is decoded and reversed, then rules are applied repeatedly:
 * each rule removes a number of leading bytes from the reversed word and
 * prepends a replacement. Processing stops when no rule applies or when
 * an applied rule is a terminating one (ending in '.').
 *
 * @param   string  $input  The word for which we want the stem in UTF-8
 *
 * @return  string  The stem
 *
 * @since   3.0
 */
private static function getStem($input)
{
	$stemRules = static::getStemRules();
	$intact    = true;
	$reversed  = strrev(utf8_decode($input));
	$ruleIndex = 0;

	// Walk the rules until none applies (-1) or a terminating rule has been applied.
	while (($ruleIndex = self::getFirstRule($reversed, $ruleIndex)) != -1)
	{
		preg_match($stemRules['rule_pattern'], $stemRules['rules'][$ruleIndex], $parts);

		// Rules flagged with '*' are only allowed while the word is still intact.
		if ($parts[2] == '*' && !$intact)
		{
			$ruleIndex++;

			continue;
		}

		// Drop $parts[3] leading bytes and put the replacement letters in front.
		$candidate = utf8_decode($parts[4]) . substr($reversed, $parts[3], strlen($reversed) - $parts[3]);

		if (!self::check($candidate))
		{
			// Stem not acceptable: go to another rule
			$ruleIndex++;

			continue;
		}

		$reversed = $candidate;

		// A rule ending in '.' terminates the stemming.
		if ($parts[5] == '.')
		{
			break;
		}
	}

	return utf8_encode(strrev($reversed));
}
|
||||
}
|
|
@ -1,446 +0,0 @@
|
|||
<?php
|
||||
/**
|
||||
* @package Joomla.Administrator
|
||||
* @subpackage com_finder
|
||||
*
|
||||
* @copyright Copyright (C) 2005 - 2018 Open Source Matters, Inc. All rights reserved.
|
||||
* @license GNU General Public License version 2 or later; see LICENSE.txt
|
||||
*/
|
||||
|
||||
defined('_JEXEC') or die;
|
||||
|
||||
JLoader::register('FinderIndexerStemmer', dirname(__DIR__) . '/stemmer.php');
|
||||
|
||||
/**
|
||||
* Porter English stemmer class for the Finder indexer package.
|
||||
*
|
||||
* This class was adapted from one written by Richard Heyes.
|
||||
* See copyright and link information above.
|
||||
*
|
||||
* @since 2.5
|
||||
*/
|
||||
class FinderIndexerStemmerPorter_En extends FinderIndexerStemmer
|
||||
{
|
||||
/**
|
||||
* Regex for matching a consonant.
|
||||
*
|
||||
* @var string
|
||||
* @since 2.5
|
||||
*/
|
||||
private static $regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';
|
||||
|
||||
/**
|
||||
* Regex for matching a vowel
|
||||
*
|
||||
* @var string
|
||||
* @since 2.5
|
||||
*/
|
||||
private static $regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';
|
||||
|
||||
/**
 * Method to stem a token and return the root.
 *
 * Tokens of two bytes or fewer, and tokens whose language is neither 'en'
 * nor '*', are returned unchanged. Otherwise the Porter steps are applied
 * in order and the result is memoised per language in the instance cache.
 *
 * @param   string  $token  The token to stem.
 * @param   string  $lang   The language of the token.
 *
 * @return  string  The root token.
 *
 * @since   2.5
 */
public function stem($token, $lang)
{
	$longEnough = strlen($token) > 2;
	$isEnglish  = $lang === 'en' || $lang == '*';

	// Nothing to do for very short tokens or for other languages.
	if (!$longEnough || !$isEnglish)
	{
		return $token;
	}

	if (!isset($this->cache[$lang][$token]))
	{
		// Run the token through every step of the algorithm, in order.
		$result = $token;

		foreach (array('step1ab', 'step1c', 'step2', 'step3', 'step4', 'step5') as $step)
		{
			$result = self::$step($result);
		}

		// Add the token to the cache.
		$this->cache[$lang][$token] = $result;
	}

	return $this->cache[$lang][$token];
}
|
||||
|
||||
/**
 * Step 1, parts a and b.
 *
 * Part a rewrites endings in "s" (sses, ies, ss, s). Part b handles "eed",
 * "ing" and "ed", followed by a tidy-up of the stem that remains after an
 * "ing"/"ed" ending was matched.
 *
 * @param   string  $word  The token to stem.
 *
 * @return  string
 *
 * @since   2.5
 */
private static function step1ab($word)
{
	// Part a: the first matching ending wins.
	if (substr($word, -1) == 's')
	{
		$endings = array('sses' => 'ss', 'ies' => 'i', 'ss' => 'ss', 's' => '');

		foreach ($endings as $ending => $replacement)
		{
			if (self::replace($word, $ending, $replacement))
			{
				break;
			}
		}
	}

	// Part b: an "eed" ending stops the step, whether or not it was rewritten.
	if (substr($word, -2, 1) == 'e' && self::replace($word, 'eed', 'ee', 0))
	{
		return $word;
	}

	$vowelRun = '#' . self::$regex_vowel . '+#';

	// "ing" / "ed" only count when the part before them contains a vowel.
	$matchedIng = preg_match($vowelRun, substr($word, 0, -3)) && self::replace($word, 'ing', '');
	$matched    = $matchedIng || (preg_match($vowelRun, substr($word, 0, -2)) && self::replace($word, 'ed', ''));

	if (!$matched)
	{
		return $word;
	}

	// Restore a final "e" on at/bl/iz stems; the first matching ending wins.
	if (self::replace($word, 'at', 'ate') || self::replace($word, 'bl', 'ble') || self::replace($word, 'iz', 'ize'))
	{
		return $word;
	}

	if (self::doubleConsonant($word) && substr($word, -2) != 'll' && substr($word, -2) != 'ss' && substr($word, -2) != 'zz')
	{
		// Double consonant ending (other than ll, ss, zz): drop the last letter.
		$word = substr($word, 0, -1);
	}
	elseif (self::m($word) == 1 && self::cvc($word))
	{
		$word .= 'e';
	}

	return $word;
}
|
||||
|
||||
/**
 * Step 1c: turns a final "y" into "i" when the rest of the word contains a vowel.
 *
 * @param   string  $word  The token to stem.
 *
 * @return  string
 *
 * @since   2.5
 */
private static function step1c($word)
{
	if (substr($word, -1) != 'y')
	{
		return $word;
	}

	$vowelRun = '#' . self::$regex_vowel . '+#';

	if (preg_match($vowelRun, substr($word, 0, -1)))
	{
		self::replace($word, 'y', 'i');
	}

	return $word;
}
|
||||
|
||||
/**
 * Step 2: maps double suffixes to single ones.
 *
 * The candidate suffixes are selected by the penultimate letter of the word;
 * within a group the first suffix found at the end of the word wins. A
 * replacement is only made when the remaining stem has a measure above 0.
 *
 * @param   string  $word  The token to stem.
 *
 * @return  string
 *
 * @since   2.5
 */
private static function step2($word)
{
	// Penultimate letter => ordered list of suffix => replacement.
	$groups = array(
		'a' => array('ational' => 'ate', 'tional' => 'tion'),
		'c' => array('enci' => 'ence', 'anci' => 'ance'),
		'e' => array('izer' => 'ize'),
		'g' => array('logi' => 'log'),
		'l' => array('entli' => 'ent', 'ousli' => 'ous', 'alli' => 'al', 'bli' => 'ble', 'eli' => 'e'),
		'o' => array('ization' => 'ize', 'ation' => 'ate', 'ator' => 'ate'),
		's' => array('iveness' => 'ive', 'fulness' => 'ful', 'ousness' => 'ous', 'alism' => 'al'),
		't' => array('biliti' => 'ble', 'aliti' => 'al', 'iviti' => 'ive'),
	);

	$penultimate = substr($word, -2, 1);

	if (!isset($groups[$penultimate]))
	{
		return $word;
	}

	foreach ($groups[$penultimate] as $suffix => $replacement)
	{
		if (self::replace($word, $suffix, $replacement, 0))
		{
			break;
		}
	}

	return $word;
}
|
||||
|
||||
/**
 * Step 3: handles the suffixes ical, ness, icate, iciti, ful, ative and alize.
 *
 * The candidate suffixes are selected by the penultimate letter of the word;
 * within a group the first suffix found at the end of the word wins. A
 * replacement is only made when the remaining stem has a measure above 0.
 *
 * @param   string  $word  The token to stem.
 *
 * @return  string
 *
 * @since   2.5
 */
private static function step3($word)
{
	// Penultimate letter => ordered list of suffix => replacement.
	$groups = array(
		'a' => array('ical' => 'ic'),
		's' => array('ness' => ''),
		't' => array('icate' => 'ic', 'iciti' => 'ic'),
		'u' => array('ful' => ''),
		'v' => array('ative' => ''),
		'z' => array('alize' => 'al'),
	);

	$penultimate = substr($word, -2, 1);

	if (!isset($groups[$penultimate]))
	{
		return $word;
	}

	foreach ($groups[$penultimate] as $suffix => $replacement)
	{
		if (self::replace($word, $suffix, $replacement, 0))
		{
			break;
		}
	}

	return $word;
}
|
||||
|
||||
/**
 * Step 4: strips a residual suffix.
 *
 * The candidate suffixes are selected by the penultimate letter of the word;
 * within a group the first suffix found at the end of the word wins. A suffix
 * is only removed when the remaining stem has a measure above 1.
 *
 * @param   string  $word  The token to stem.
 *
 * @return  string
 *
 * @since   2.5
 */
private static function step4($word)
{
	$penultimate = substr($word, -2, 1);

	// "o" is special: "ion" is only tried after "t" or "s", otherwise "ou" is tried.
	if ($penultimate == 'o')
	{
		$tail = substr($word, -4);

		if ($tail == 'tion' || $tail == 'sion')
		{
			self::replace($word, 'ion', '', 1);
		}
		else
		{
			self::replace($word, 'ou', '', 1);
		}

		return $word;
	}

	// Penultimate letter => ordered list of suffixes to strip.
	$groups = array(
		'a' => array('al'),
		'c' => array('ance', 'ence'),
		'e' => array('er'),
		'i' => array('ic'),
		'l' => array('able', 'ible'),
		'n' => array('ant', 'ement', 'ment', 'ent'),
		's' => array('ism'),
		't' => array('ate', 'iti'),
		'u' => array('ous'),
		'v' => array('ive'),
		'z' => array('ize'),
	);

	if (!isset($groups[$penultimate]))
	{
		return $word;
	}

	foreach ($groups[$penultimate] as $suffix)
	{
		if (self::replace($word, $suffix, '', 1))
		{
			break;
		}
	}

	return $word;
}
|
||||
|
||||
/**
 * Step 5: final tidy-up.
 *
 * Part a drops a final "e" when the stem before it has a measure above 1, or
 * a measure of exactly 1 without ending in a consonant-vowel-consonant
 * sequence. Part b reduces a final double "l" when the measure is above 1.
 *
 * @param   string  $word  The token to stem.
 *
 * @return  string
 *
 * @since   2.5
 */
private static function step5($word)
{
	// Part a
	if (substr($word, -1) == 'e')
	{
		$stem    = substr($word, 0, -1);
		$measure = self::m($stem);

		if ($measure > 1 || ($measure == 1 && !self::cvc($stem)))
		{
			self::replace($word, 'e', '');
		}
	}

	// Part b
	$endsInDoubleL = self::m($word) > 1 && self::doubleConsonant($word) && substr($word, -1) == 'l';

	if ($endsInDoubleL)
	{
		$word = substr($word, 0, -1);
	}

	return $word;
}
|
||||
|
||||
/**
 * Replaces the first string with the second, at the end of the string. If third
 * arg is given, then the preceding string must exceed that m count for the
 * replacement to take place.
 *
 * @param   string   &$str   String to check; modified in place when the replacement is made
 * @param   string   $check  Ending to check for
 * @param   string   $repl   Replacement string
 * @param   integer  $m      Optional m() count the stem has to exceed
 *
 * @return  boolean  Whether the $check string was at the end
 *                   of the $str string. True does not necessarily mean
 *                   that it was replaced.
 *
 * @since   2.5
 */
private static function replace(&$str, $check, $repl, $m = null)
{
	$offset = -strlen($check);

	// The ending is not there: report it and leave the string alone.
	if (substr($str, $offset) != $check)
	{
		return false;
	}

	$stem = substr($str, 0, $offset);

	// Replace unconditionally, or only when the stem's measure is large enough.
	if ($m === null || self::m($stem) > $m)
	{
		$str = $stem . $repl;
	}

	return true;
}
|
||||
|
||||
/**
 * m() measures the number of consonant sequences in $str. if c is
 * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
 * presence,
 *
 * <c><v>       gives 0
 * <c>vc<v>     gives 1
 * <c>vcvc<v>   gives 2
 * <c>vcvcvc<v> gives 3
 *
 * @param   string  $str  The string to return the m count for
 *
 * @return  integer  The m count
 *
 * @since   2.5
 */
private static function m($str)
{
	$consonant = self::$regex_consonant;
	$vowel     = self::$regex_vowel;

	// Strip the optional leading consonants, then the optional trailing vowels.
	$core = preg_replace(array('#^' . $consonant . '+#', '#' . $vowel . '+$#'), '', $str);

	// What is left is a series of vowel-run/consonant-run pairs: count them.
	preg_match_all('#(' . $vowel . '+' . $consonant . '+)#', $core, $pairs);

	return count($pairs[1]);
}
|
||||
|
||||
/**
 * Returns true/false as to whether the given string contains two
 * of the same consonant next to each other at the end of the string.
 *
 * @param   string  $str  String to check
 *
 * @return  boolean  Result
 *
 * @since   2.5
 */
private static function doubleConsonant($str)
{
	$c = self::$regex_consonant;

	// Square brackets are used for the string offsets: the curly brace form is a parse error as of PHP 8.
	return preg_match("#$c{2}$#", $str, $matches) && $matches[0][0] == $matches[0][1];
}
|
||||
|
||||
/**
 * Checks for ending CVC sequence where second C is not W, X or Y
 *
 * @param   string  $str  String to check
 *
 * @return  boolean  Result
 *
 * @since   2.5
 */
private static function cvc($str)
{
	$c = self::$regex_consonant;
	$v = self::$regex_vowel;

	// The match must be exactly three bytes long so that offset 2 is the final consonant.
	if (!preg_match("#($c$v$c)$#", $str, $matches) || strlen($matches[1]) != 3)
	{
		return false;
	}

	// Square brackets are used for the string offset: the curly brace form is a parse error as of PHP 8.
	$last = $matches[1][2];

	return $last != 'w' && $last != 'x' && $last != 'y';
}
|
||||
}
|
|
@ -1,133 +0,0 @@
|
|||
<?php
|
||||
/**
|
||||
* @package Joomla.Administrator
|
||||
* @subpackage com_finder
|
||||
*
|
||||
* @copyright Copyright (C) 2005 - 2018 Open Source Matters, Inc. All rights reserved.
|
||||
* @license GNU General Public License version 2 or later; see LICENSE.txt
|
||||
*/
|
||||
|
||||
defined('_JEXEC') or die;
|
||||
|
||||
JLoader::register('FinderIndexerStemmer', dirname(__DIR__) . '/stemmer.php');
|
||||
|
||||
/**
 * Snowball stemmer class for the Finder indexer package.
 *
 * Stemming is delegated to global stem_<language>() functions when they exist
 * (presumably supplied by an optional PHP extension - confirm); otherwise the
 * token is returned unchanged.
 *
 * @since  2.5
 */
class FinderIndexerStemmerSnowball extends FinderIndexerStemmer
{
	/**
	 * Method to stem a token and return the root.
	 *
	 * @param   string  $token  The token to stem.
	 * @param   string  $lang   The language of the token. '*' (All) is resolved to the sef code of the
	 *                          first entry returned by JLanguageHelper::getLanguages().
	 *
	 * @return  string  The root token.
	 *
	 * @since   2.5
	 */
	public function stem($token, $lang)
	{
		// Language to use if All is specified; looked up once and then reused.
		static $defaultLang = '';

		// Stem function per language code; anything not listed falls back to English.
		static $functions = array(
			'da' => 'stem_danish',
			'de' => 'stem_german',
			'en' => 'stem_english',
			'es' => 'stem_spanish',
			'fi' => 'stem_finnish',
			'fr' => 'stem_french',
			'hu' => 'stem_hungarian',
			'it' => 'stem_italian',
			'nb' => 'stem_norwegian',
			'nl' => 'stem_dutch',
			'pt' => 'stem_portuguese',
			'ro' => 'stem_romanian',
			'ru' => 'stem_russian_unicode',
			'sv' => 'stem_swedish',
			'tr' => 'stem_turkish_unicode',
		);

		// If language is All then use the site default language.
		if ($lang === '*')
		{
			if ($defaultLang === '')
			{
				$languages   = JLanguageHelper::getLanguages();
				$defaultLang = $languages[0]->sef ?? '*';
			}

			// Apply the memoised default on every call, not only on the one that
			// resolved it; otherwise later '*' tokens would silently be stemmed
			// with the English fallback.
			$lang = $defaultLang;
		}

		// Stem the token if it is not in the cache.
		if (!isset($this->cache[$lang][$token]))
		{
			$function = $functions[$lang] ?? 'stem_english';

			// Stem the word if the stemmer function exists, otherwise keep the token as is.
			$this->cache[$lang][$token] = function_exists($function) ? $function($token) : $token;
		}

		return $this->cache[$lang][$token];
	}
}
|
|
@ -10,7 +10,7 @@
|
|||
defined('_JEXEC') or die;
|
||||
|
||||
/**
|
||||
* Stemmer base class for the Finder indexer package.
|
||||
* Taxonomy base class for the Finder indexer package.
|
||||
*
|
||||
* @since 2.5
|
||||
*/
|
||||
|
|
|
@ -98,7 +98,14 @@ class FinderIndexerToken
|
|||
*/
|
||||
public function __construct($term, $lang, $spacer = ' ')
|
||||
{
|
||||
$this->language = $lang;
|
||||
if (!$lang)
|
||||
{
|
||||
$this->language = '*';
|
||||
}
|
||||
else
|
||||
{
|
||||
$this->language = $lang;
|
||||
}
|
||||
|
||||
// Tokens can be a single word or an array of words representing a phrase.
|
||||
if (is_array($term))
|
||||
|
|
|
@ -35,11 +35,6 @@ COM_FINDER_CONFIG_SORT_OPTION_LIST_PRICE="List price"
|
|||
COM_FINDER_CONFIG_SORT_OPTION_RELEVANCE="Relevance"
|
||||
COM_FINDER_CONFIG_SORT_OPTION_START_DATE="Date"
|
||||
COM_FINDER_CONFIG_SORT_ORDER_LABEL="Sort Field"
|
||||
COM_FINDER_CONFIG_STEMMER_ENABLE_LABEL="Enable Language Stemmer"
|
||||
COM_FINDER_CONFIG_STEMMER_FR="French Only"
|
||||
COM_FINDER_CONFIG_STEMMER_LABEL="Select Language Stemmer"
|
||||
COM_FINDER_CONFIG_STEMMER_PORTER_EN="English Only"
|
||||
COM_FINDER_CONFIG_STEMMER_SNOWBALL="Snowball"
|
||||
COM_FINDER_CONFIG_TEXT_MULTIPLIER_DESCRIPTION="The multiplier is used to control how much influence matching text has on the overall relevance score of a search result. A multiplier is considered in relationship to the other multipliers. The body text comes from the summary and/or body of the content."
|
||||
COM_FINDER_CONFIG_TEXT_MULTIPLIER_LABEL="Body Text Weight Multiplier"
|
||||
COM_FINDER_CONFIG_TITLE_MULTIPLIER_DESCRIPTION="The multiplier is used to control how much influence matching text has on the overall relevance score of a search result. A multiplier is considered in relationship to the other multipliers. The title text comes from the title of the content."
|
||||
|
|
|
@ -37,6 +37,7 @@
|
|||
<exec executable="phpcs">
|
||||
<arg value="--report=checkstyle" />
|
||||
<arg value="--extensions=php" />
|
||||
<arg value="--encoding=utf-8" />
|
||||
<arg value="-p" />
|
||||
<arg value="--report-file=${basedir}/build/logs/checkstyle.xml" />
|
||||
<arg value="--standard=${basedir}/build/phpcs/Joomla" />
|
||||
|
|
|
@ -18,9 +18,9 @@ use Joomla\Utilities\ArrayHelper;
|
|||
// Register dependent classes.
|
||||
define('FINDER_PATH_INDEXER', JPATH_ADMINISTRATOR . '/components/com_finder/helpers/indexer');
|
||||
\JLoader::register('FinderIndexerHelper', FINDER_PATH_INDEXER . '/helper.php');
|
||||
\JLoader::register('FinderIndexerLanguage', FINDER_PATH_INDEXER . '/language.php');
|
||||
\JLoader::register('FinderIndexerQuery', FINDER_PATH_INDEXER . '/query.php');
|
||||
\JLoader::register('FinderIndexerResult', FINDER_PATH_INDEXER . '/result.php');
|
||||
\JLoader::register('FinderIndexerStemmer', FINDER_PATH_INDEXER . '/stemmer.php');
|
||||
|
||||
/**
|
||||
* Search model class for the Finder package.
|
||||
|
@ -221,6 +221,7 @@ class SearchModel extends ListModel
|
|||
$query->where($db->quoteName('l.start_date') . ' = ' . $date2);
|
||||
}
|
||||
}
|
||||
|
||||
// Filter by language
|
||||
if ($this->getState('filter.language'))
|
||||
{
|
||||
|
@ -266,12 +267,27 @@ class SearchModel extends ListModel
|
|||
* If there are no optional or required search terms in the query, we
|
||||
* can get the results in one relatively simple database query.
|
||||
*/
|
||||
if (empty($this->includedTerms))
|
||||
if (empty($this->includedTerms) && $this->searchquery->empty)
|
||||
{
|
||||
// Return the results.
|
||||
return $query;
|
||||
}
|
||||
|
||||
/*
|
||||
* If there are no optional or required search terms in the query and
|
||||
* empty searches are not allowed, we return an empty query.
|
||||
*/
|
||||
if (empty($this->includedTerms) && !$this->searchquery->empty)
|
||||
{
|
||||
// Since we need to return a query, we simplify this one.
|
||||
$query->clear('join')
|
||||
->clear('where')
|
||||
->clear('group')
|
||||
->where('false');
|
||||
|
||||
return $query;
|
||||
}
|
||||
|
||||
$included = call_user_func_array('array_merge', $this->includedTerms);
|
||||
$query->join('INNER', $this->_db->quoteName('#__finder_links_terms') . ' AS m ON m.link_id = l.link_id')
|
||||
->where('m.term_id IN (' . implode(',', $included) . ')');
|
||||
|
@ -291,10 +307,14 @@ class SearchModel extends ListModel
|
|||
*/
|
||||
if (count($this->requiredTerms))
|
||||
{
|
||||
$required = call_user_func_array('array_merge', $this->requiredTerms);
|
||||
$query->join('INNER', $this->_db->quoteName('#__finder_links_terms') . ' AS r ON r.link_id = l.link_id')
|
||||
->where('r.term_id IN (' . implode(',', $required) . ')')
|
||||
->having('COUNT(DISTINCT r.term_id) = ' . count($required));
|
||||
$i = 0;
|
||||
|
||||
foreach ($this->requiredTerms as $terms)
|
||||
{
|
||||
$query->join('INNER', $this->_db->quoteName('#__finder_links_terms') . ' AS r' . $i . ' ON r' . $i . '.link_id = l.link_id')
|
||||
->where('r' . $i . '.term_id IN (' . implode(',', $terms) . ')');
|
||||
$i++;
|
||||
}
|
||||
}
|
||||
|
||||
return $query;
|
||||
|
@ -361,12 +381,6 @@ class SearchModel extends ListModel
|
|||
|
||||
$this->setState('filter.language', Multilanguage::isEnabled());
|
||||
|
||||
// Setup the stemmer.
|
||||
if ($params->get('stem', 1) && $params->get('stemmer', 'porter_en'))
|
||||
{
|
||||
\FinderIndexerHelper::$stemmer = \FinderIndexerStemmer::getInstance($params->get('stemmer', 'porter_en'));
|
||||
}
|
||||
|
||||
$request = $input->request;
|
||||
$options = array();
|
||||
|
||||
|
|
|
@ -65,13 +65,14 @@ class SuggestionsModel extends ListModel
|
|||
// Create a new query object.
|
||||
$db = $this->getDbo();
|
||||
$query = $db->getQuery(true);
|
||||
$lang = \FinderIndexerHelper::getPrimaryLanguage($this->getState('language'));
|
||||
|
||||
// Select required fields
|
||||
$query->select('t.term')
|
||||
->from($db->quoteName('#__finder_terms') . ' AS t')
|
||||
->where('t.term LIKE ' . $db->quote($db->escape($this->getState('input'), true) . '%'))
|
||||
->where('t.common = 0')
|
||||
->where('t.language IN (' . $db->quote($db->escape($this->getState('language'), true)) . ', ' . $db->quote('*') . ')')
|
||||
->where('t.language IN (' . $db->quote($lang) . ', ' . $db->quote('*') . ')')
|
||||
->order('t.links DESC')
|
||||
->order('t.weight DESC');
|
||||
|
||||
|
|
|
@ -80,7 +80,8 @@
|
|||
"symfony/debug": "3.4.*",
|
||||
"symfony/ldap": "3.4.*",
|
||||
"symfony/web-link": "3.4.*",
|
||||
"symfony/yaml": "3.4.*"
|
||||
"symfony/yaml": "3.4.*",
|
||||
"wamania/php-stemmer": "^1.2"
|
||||
},
|
||||
"require-dev": {
|
||||
"phpunit/phpunit": "~6.0",
|
||||
|
|
48
composer.lock
generated
48
composer.lock
generated
|
@ -1,10 +1,10 @@
|
|||
{
|
||||
"_readme": [
|
||||
"This file locks the dependencies of your project to a known state",
|
||||
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
|
||||
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
|
||||
"This file is @generated automatically"
|
||||
],
|
||||
"content-hash": "3bd7695b38b737c3b555d83058b877fd",
|
||||
"content-hash": "81c9ca521a0712b07e07143b469e835a",
|
||||
"packages": [
|
||||
{
|
||||
"name": "composer/ca-bundle",
|
||||
|
@ -2513,6 +2513,50 @@
|
|||
"homepage": "https://symfony.com",
|
||||
"time": "2018-05-03T23:18:14+00:00"
|
||||
},
|
||||
{
|
||||
"name": "wamania/php-stemmer",
|
||||
"version": "1.2",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/wamania/php-stemmer.git",
|
||||
"reference": "6cc76829bddd46f7ae7678e0bf87a0c872c8cf58"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/wamania/php-stemmer/zipball/6cc76829bddd46f7ae7678e0bf87a0c872c8cf58",
|
||||
"reference": "6cc76829bddd46f7ae7678e0bf87a0c872c8cf58",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
"php": ">=5.3.0"
|
||||
},
|
||||
"require-dev": {
|
||||
"phpunit/phpunit": "^4.8"
|
||||
},
|
||||
"type": "library",
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"Wamania\\Snowball\\": "src/"
|
||||
}
|
||||
},
|
||||
"notification-url": "https://packagist.org/downloads/",
|
||||
"license": [
|
||||
"MIT"
|
||||
],
|
||||
"authors": [
|
||||
{
|
||||
"name": "Wamania",
|
||||
"homepage": "http://wamania.com"
|
||||
}
|
||||
],
|
||||
"description": "Native PHP5 Stemmer",
|
||||
"keywords": [
|
||||
"php",
|
||||
"porter",
|
||||
"stemmer"
|
||||
],
|
||||
"time": "2017-01-27T17:16:44+00:00"
|
||||
},
|
||||
{
|
||||
"name": "zendframework/zend-diactoros",
|
||||
"version": "1.7.2",
|
||||
|
|
4
libraries/vendor/composer/ClassLoader.php
vendored
4
libraries/vendor/composer/ClassLoader.php
vendored
|
@ -379,9 +379,9 @@ class ClassLoader
|
|||
$subPath = substr($subPath, 0, $lastPos);
|
||||
$search = $subPath.'\\';
|
||||
if (isset($this->prefixDirsPsr4[$search])) {
|
||||
$pathEnd = DIRECTORY_SEPARATOR . substr($logicalPathPsr4, $lastPos + 1);
|
||||
foreach ($this->prefixDirsPsr4[$search] as $dir) {
|
||||
if (file_exists($file = $dir . $pathEnd)) {
|
||||
$length = $this->prefixLengthsPsr4[$first][$search];
|
||||
if (file_exists($file = $dir . DIRECTORY_SEPARATOR . substr($logicalPathPsr4, $length))) {
|
||||
return $file;
|
||||
}
|
||||
}
|
||||
|
|
15
libraries/vendor/composer/autoload_classmap.php
vendored
15
libraries/vendor/composer/autoload_classmap.php
vendored
|
@ -946,6 +946,21 @@ return array(
|
|||
'Symfony\\Polyfill\\Util\\BinaryOnFuncOverload' => $vendorDir . '/symfony/polyfill-util/BinaryOnFuncOverload.php',
|
||||
'Symfony\\Polyfill\\Util\\LegacyTestListener' => $vendorDir . '/symfony/polyfill-util/LegacyTestListener.php',
|
||||
'Symfony\\Polyfill\\Util\\TestListenerTrait' => $vendorDir . '/symfony/polyfill-util/TestListenerTrait.php',
|
||||
'Wamania\\Snowball\\Danish' => $vendorDir . '/wamania/php-stemmer/src/Danish.php',
|
||||
'Wamania\\Snowball\\Dutch' => $vendorDir . '/wamania/php-stemmer/src/Dutch.php',
|
||||
'Wamania\\Snowball\\English' => $vendorDir . '/wamania/php-stemmer/src/English.php',
|
||||
'Wamania\\Snowball\\French' => $vendorDir . '/wamania/php-stemmer/src/French.php',
|
||||
'Wamania\\Snowball\\German' => $vendorDir . '/wamania/php-stemmer/src/German.php',
|
||||
'Wamania\\Snowball\\Italian' => $vendorDir . '/wamania/php-stemmer/src/Italian.php',
|
||||
'Wamania\\Snowball\\Norwegian' => $vendorDir . '/wamania/php-stemmer/src/Norwegian.php',
|
||||
'Wamania\\Snowball\\Portuguese' => $vendorDir . '/wamania/php-stemmer/src/Portuguese.php',
|
||||
'Wamania\\Snowball\\Romanian' => $vendorDir . '/wamania/php-stemmer/src/Romanian.php',
|
||||
'Wamania\\Snowball\\Russian' => $vendorDir . '/wamania/php-stemmer/src/Russian.php',
|
||||
'Wamania\\Snowball\\Spanish' => $vendorDir . '/wamania/php-stemmer/src/Spanish.php',
|
||||
'Wamania\\Snowball\\Stem' => $vendorDir . '/wamania/php-stemmer/src/Stem.php',
|
||||
'Wamania\\Snowball\\Stemmer' => $vendorDir . '/wamania/php-stemmer/src/Stemmer.php',
|
||||
'Wamania\\Snowball\\Swedish' => $vendorDir . '/wamania/php-stemmer/src/Swedish.php',
|
||||
'Wamania\\Snowball\\Utf8' => $vendorDir . '/wamania/php-stemmer/src/Utf8.php',
|
||||
'Zend\\Diactoros\\AbstractSerializer' => $vendorDir . '/zendframework/zend-diactoros/src/AbstractSerializer.php',
|
||||
'Zend\\Diactoros\\CallbackStream' => $vendorDir . '/zendframework/zend-diactoros/src/CallbackStream.php',
|
||||
'Zend\\Diactoros\\Exception\\DeprecatedMethodException' => $vendorDir . '/zendframework/zend-diactoros/src/Exception/DeprecatedMethodException.php',
|
||||
|
|
1
libraries/vendor/composer/autoload_psr4.php
vendored
1
libraries/vendor/composer/autoload_psr4.php
vendored
|
@ -7,6 +7,7 @@ $baseDir = dirname(dirname($vendorDir));
|
|||
|
||||
return array(
|
||||
'Zend\\Diactoros\\' => array($vendorDir . '/zendframework/zend-diactoros/src'),
|
||||
'Wamania\\Snowball\\' => array($vendorDir . '/wamania/php-stemmer/src'),
|
||||
'Symfony\\Polyfill\\Util\\' => array($vendorDir . '/symfony/polyfill-util'),
|
||||
'Symfony\\Polyfill\\Php56\\' => array($vendorDir . '/symfony/polyfill-php56'),
|
||||
'Symfony\\Polyfill\\Mbstring\\' => array($vendorDir . '/symfony/polyfill-mbstring'),
|
||||
|
|
23
libraries/vendor/composer/autoload_static.php
vendored
23
libraries/vendor/composer/autoload_static.php
vendored
|
@ -34,6 +34,10 @@ class ComposerStaticInita4c4383b02fcf9dfb95cc0397c641cf1
|
|||
array (
|
||||
'Zend\\Diactoros\\' => 15,
|
||||
),
|
||||
'W' =>
|
||||
array (
|
||||
'Wamania\\Snowball\\' => 17,
|
||||
),
|
||||
'S' =>
|
||||
array (
|
||||
'Symfony\\Polyfill\\Util\\' => 22,
|
||||
|
@ -110,6 +114,10 @@ class ComposerStaticInita4c4383b02fcf9dfb95cc0397c641cf1
|
|||
array (
|
||||
0 => __DIR__ . '/..' . '/zendframework/zend-diactoros/src',
|
||||
),
|
||||
'Wamania\\Snowball\\' =>
|
||||
array (
|
||||
0 => __DIR__ . '/..' . '/wamania/php-stemmer/src',
|
||||
),
|
||||
'Symfony\\Polyfill\\Util\\' =>
|
||||
array (
|
||||
0 => __DIR__ . '/..' . '/symfony/polyfill-util',
|
||||
|
@ -1233,6 +1241,21 @@ class ComposerStaticInita4c4383b02fcf9dfb95cc0397c641cf1
|
|||
'Symfony\\Polyfill\\Util\\BinaryOnFuncOverload' => __DIR__ . '/..' . '/symfony/polyfill-util/BinaryOnFuncOverload.php',
|
||||
'Symfony\\Polyfill\\Util\\LegacyTestListener' => __DIR__ . '/..' . '/symfony/polyfill-util/LegacyTestListener.php',
|
||||
'Symfony\\Polyfill\\Util\\TestListenerTrait' => __DIR__ . '/..' . '/symfony/polyfill-util/TestListenerTrait.php',
|
||||
'Wamania\\Snowball\\Danish' => __DIR__ . '/..' . '/wamania/php-stemmer/src/Danish.php',
|
||||
'Wamania\\Snowball\\Dutch' => __DIR__ . '/..' . '/wamania/php-stemmer/src/Dutch.php',
|
||||
'Wamania\\Snowball\\English' => __DIR__ . '/..' . '/wamania/php-stemmer/src/English.php',
|
||||
'Wamania\\Snowball\\French' => __DIR__ . '/..' . '/wamania/php-stemmer/src/French.php',
|
||||
'Wamania\\Snowball\\German' => __DIR__ . '/..' . '/wamania/php-stemmer/src/German.php',
|
||||
'Wamania\\Snowball\\Italian' => __DIR__ . '/..' . '/wamania/php-stemmer/src/Italian.php',
|
||||
'Wamania\\Snowball\\Norwegian' => __DIR__ . '/..' . '/wamania/php-stemmer/src/Norwegian.php',
|
||||
'Wamania\\Snowball\\Portuguese' => __DIR__ . '/..' . '/wamania/php-stemmer/src/Portuguese.php',
|
||||
'Wamania\\Snowball\\Romanian' => __DIR__ . '/..' . '/wamania/php-stemmer/src/Romanian.php',
|
||||
'Wamania\\Snowball\\Russian' => __DIR__ . '/..' . '/wamania/php-stemmer/src/Russian.php',
|
||||
'Wamania\\Snowball\\Spanish' => __DIR__ . '/..' . '/wamania/php-stemmer/src/Spanish.php',
|
||||
'Wamania\\Snowball\\Stem' => __DIR__ . '/..' . '/wamania/php-stemmer/src/Stem.php',
|
||||
'Wamania\\Snowball\\Stemmer' => __DIR__ . '/..' . '/wamania/php-stemmer/src/Stemmer.php',
|
||||
'Wamania\\Snowball\\Swedish' => __DIR__ . '/..' . '/wamania/php-stemmer/src/Swedish.php',
|
||||
'Wamania\\Snowball\\Utf8' => __DIR__ . '/..' . '/wamania/php-stemmer/src/Utf8.php',
|
||||
'Zend\\Diactoros\\AbstractSerializer' => __DIR__ . '/..' . '/zendframework/zend-diactoros/src/AbstractSerializer.php',
|
||||
'Zend\\Diactoros\\CallbackStream' => __DIR__ . '/..' . '/zendframework/zend-diactoros/src/CallbackStream.php',
|
||||
'Zend\\Diactoros\\Exception\\DeprecatedMethodException' => __DIR__ . '/..' . '/zendframework/zend-diactoros/src/Exception/DeprecatedMethodException.php',
|
||||
|
|
46
libraries/vendor/composer/installed.json
vendored
46
libraries/vendor/composer/installed.json
vendored
|
@ -2652,5 +2652,51 @@
|
|||
"psr",
|
||||
"psr-7"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "wamania/php-stemmer",
|
||||
"version": "1.2",
|
||||
"version_normalized": "1.2.0.0",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/wamania/php-stemmer.git",
|
||||
"reference": "6cc76829bddd46f7ae7678e0bf87a0c872c8cf58"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/wamania/php-stemmer/zipball/6cc76829bddd46f7ae7678e0bf87a0c872c8cf58",
|
||||
"reference": "6cc76829bddd46f7ae7678e0bf87a0c872c8cf58",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
"php": ">=5.3.0"
|
||||
},
|
||||
"require-dev": {
|
||||
"phpunit/phpunit": "^4.8"
|
||||
},
|
||||
"time": "2017-01-27T17:16:44+00:00",
|
||||
"type": "library",
|
||||
"installation-source": "dist",
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"Wamania\\Snowball\\": "src/"
|
||||
}
|
||||
},
|
||||
"notification-url": "https://packagist.org/downloads/",
|
||||
"license": [
|
||||
"MIT"
|
||||
],
|
||||
"authors": [
|
||||
{
|
||||
"name": "Wamania",
|
||||
"homepage": "http://wamania.com"
|
||||
}
|
||||
],
|
||||
"description": "Native PHP5 Stemmer",
|
||||
"keywords": [
|
||||
"php",
|
||||
"porter",
|
||||
"stemmer"
|
||||
]
|
||||
}
|
||||
]
|
||||
|
|
21
libraries/vendor/wamania/php-stemmer/LICENSE
vendored
Normal file
21
libraries/vendor/wamania/php-stemmer/LICENSE
vendored
Normal file
|
@ -0,0 +1,21 @@
|
|||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2016 wamania
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
149
libraries/vendor/wamania/php-stemmer/src/Danish.php
vendored
Normal file
149
libraries/vendor/wamania/php-stemmer/src/Danish.php
vendored
Normal file
|
@ -0,0 +1,149 @@
|
|||
<?php
|
||||
namespace Wamania\Snowball;
|
||||
|
||||
/**
 * Danish stemmer.
 *
 * @link http://snowball.tartarus.org/algorithms/danish/stemmer.html
 * @author wamania
 *
 */
class Danish extends Stem
{
    /**
     * All danish vowels
     */
    protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y', 'æ', 'å', 'ø');

    /**
     * {@inheritdoc}
     */
    public function stem($word)
    {
        // Everything below works on UTF-8 text only.
        if (! Utf8::check($word)) {
            throw new \Exception('Word must be in UTF-8');
        }

        $this->word = Utf8::strtolower($word);

        // Only R1 is needed for Danish; R2 is never consulted.
        $this->r1();

        // R1 is moved so that at least 3 letters precede it.
        if ($this->r1Index < 3) {
            $this->r1Index = 3;
            $this->r1 = Utf8::substr($this->word, 3);
        }

        // Run the four steps in order; each one edits $this->word in place.
        $this->step1();
        $this->step2();
        $this->step3();
        $this->step4();

        return $this->word;
    }

    /**
     * Tells whether $word ends with a valid s-ending, i.e. one of
     * a b c d f g h j k l m n o p r t v y z å
     *
     * @param string $word
     * @return boolean
     */
    private function hasValidSEnding($word)
    {
        $validEndings = array(
            'a', 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l',
            'm', 'n', 'o', 'p', 'r', 't', 'v', 'y', 'z', 'å'
        );

        return in_array(Utf8::substr($word, -1, 1), $validEndings);
    }

    /**
     * Step 1
     * Look for the longest of the listed suffixes in R1 and delete it; failing that,
     * delete a final s in R1 when it is preceded by a valid s-ending.
     */
    private function step1()
    {
        // hed ethed ered e erede ende erende ene erne ere en heden eren er heder erer
        // heds es endes erendes enes ernes eres ens hedens erens ers ets erets et eret
        // (listed longest first)
        $suffixes = array(
            'erendes', 'erende', 'hedens', 'erede', 'ethed', 'heden', 'endes', 'erets', 'heder', 'ernes',
            'erens', 'ered', 'ende', 'erne', 'eres', 'eren', 'eret', 'erer', 'enes', 'heds',
            'ens', 'ene', 'ere', 'ers', 'ets', 'hed', 'es', 'et', 'er', 'en', 'e'
        );

        $position = $this->searchIfInR1($suffixes);

        if ($position !== false) {
            // delete
            $this->word = Utf8::substr($this->word, 0, $position);

            return true;
        }

        // s
        $position = $this->searchIfInR1(array('s'));

        if ($position === false) {
            return null;
        }

        // delete only if what remains finishes with a valid s-ending
        $candidate = Utf8::substr($this->word, 0, $position);

        if ($this->hasValidSEnding($candidate)) {
            $this->word = $candidate;
        }

        return true;
    }

    /**
     * Step 2
     * When one of gd dt gt kt is found in R1, drop the last letter of the word.
     */
    private function step2()
    {
        if ($this->searchIfInR1(array('gd', 'dt', 'gt', 'kt')) === false) {
            return;
        }

        $this->word = Utf8::substr($this->word, 0, -1);
    }

    /**
     * Step 3:
     * Handles igst, then ig / lig / elig / els, then løst.
     */
    private function step3()
    {
        // A word ending in igst loses its final st.
        if ($this->search(array('igst')) !== false) {
            $this->word = Utf8::substr($this->word, 0, -2);
        }

        // ig lig elig els (in R1): delete, and then repeat step 2
        $position = $this->searchIfInR1(array('elig', 'lig', 'ig', 'els'));

        if ($position !== false) {
            $this->word = Utf8::substr($this->word, 0, $position);
            $this->step2();

            return true;
        }

        // løst (in R1): replace with løs, i.e. drop the final letter
        if ($this->searchIfInR1(array('løst')) !== false) {
            $this->word = Utf8::substr($this->word, 0, -1);
        }
    }

    /**
     * Step 4: undouble
     * When the word ends with a doubled consonant in R1, one of the two is removed.
     */
    private function step4()
    {
        $lastIndex = Utf8::strlen($this->word) - 1;

        // The final letter has to lie inside R1.
        if (!$this->inR1($lastIndex)) {
            return false;
        }

        $last = Utf8::substr($this->word, -1, 1);

        // Only consonants can be undoubled.
        if (in_array($last, self::$vowels)) {
            return false;
        }

        $previous = Utf8::substr($this->word, -2, 1);

        if ($last == $previous) {
            $this->word = Utf8::substr($this->word, 0, -1);
        }

        return true;
    }
}
|
303
libraries/vendor/wamania/php-stemmer/src/Dutch.php
vendored
Normal file
303
libraries/vendor/wamania/php-stemmer/src/Dutch.php
vendored
Normal file
|
@ -0,0 +1,303 @@
|
|||
<?php
|
||||
namespace Wamania\Snowball;
|
||||
|
||||
/**
|
||||
*
|
||||
* @link http://snowball.tartarus.org/algorithms/dutch/stemmer.html
|
||||
* @author wamania
|
||||
*
|
||||
*/
|
||||
class Dutch extends Stem
|
||||
{
|
||||
/**
|
||||
* All dutch vowels
|
||||
*/
|
||||
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y', 'è');
|
||||
|
||||
/**
|
||||
* {@inheritdoc}
|
||||
*/
|
||||
public function stem($word)
|
||||
{
|
||||
// we do ALL in UTF-8
|
||||
if (! Utf8::check($word)) {
|
||||
throw new \Exception('Word must be in UTF-8');
|
||||
}
|
||||
|
||||
$this->word = Utf8::strtolower($word);
|
||||
|
||||
// First, remove all umlaut and acute accents.
|
||||
$this->word = Utf8::str_replace(
|
||||
array('ä', 'ë', 'ï', 'ö', 'ü', 'á', 'é', 'í', 'ó', 'ú'),
|
||||
array('a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'u'),
|
||||
$this->word);
|
||||
|
||||
$this->plainVowels = implode('', self::$vowels);
|
||||
|
||||
// Put initial y, y after a vowel, and i between vowels into upper case.
|
||||
$this->word = preg_replace('#^y#u', 'Y', $this->word);
|
||||
$this->word = preg_replace('#(['.$this->plainVowels.'])y#u', '$1Y', $this->word);
|
||||
$this->word = preg_replace('#(['.$this->plainVowels.'])i(['.$this->plainVowels.'])#u', '$1I$2', $this->word);
|
||||
|
||||
// R1 and R2 (see the note on R1 and R2) are then defined as in German.
|
||||
// R1 and R2 are first set up in the standard way
|
||||
$this->r1();
|
||||
$this->r2();
|
||||
|
||||
// but then R1 is adjusted so that the region before it contains at least 3 letters.
|
||||
if ($this->r1Index < 3) {
|
||||
$this->r1Index = 3;
|
||||
$this->r1 = Utf8::substr($this->word, 3);
|
||||
}
|
||||
|
||||
// Do each of steps 1, 2 3 and 4.
|
||||
$this->step1();
|
||||
$removedE = $this->step2();
|
||||
$this->step3a();
|
||||
$this->step3b($removedE);
|
||||
$this->step4();
|
||||
$this->finish();
|
||||
|
||||
return $this->word;
|
||||
}
|
||||
|
||||
/**
|
||||
* Define a valid s-ending as a non-vowel other than j.
|
||||
* @param string $ending
|
||||
* @return boolean
|
||||
*/
|
||||
private function hasValidSEnding($word)
|
||||
{
|
||||
$lastLetter = Utf8::substr($word, -1, 1);
|
||||
return !in_array($lastLetter, array_merge(self::$vowels, array('j')));
|
||||
}
|
||||
|
||||
/**
|
||||
* Define a valid en-ending as a non-vowel, and not gem.
|
||||
* @param string $ending
|
||||
* @return boolean
|
||||
*/
|
||||
private function hasValidEnEnding($word)
|
||||
{
|
||||
$lastLetter = Utf8::substr($word, -1, 1);
|
||||
if (in_array($lastLetter, self::$vowels)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
$threeLastLetters = Utf8::substr($word, -3, 3);
|
||||
if ($threeLastLetters == 'gem') {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Define undoubling the ending as removing the last letter if the word ends kk, dd or tt.
|
||||
*/
|
||||
private function unDoubling()
|
||||
{
|
||||
if ($this->search(array('kk', 'dd', 'tt')) !== false) {
|
||||
$this->word = Utf8::substr($this->word, 0, -1);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Step 1
|
||||
* Search for the longest among the following suffixes, and perform the action indicated
|
||||
*/
|
||||
private function step1()
|
||||
{
|
||||
// heden
|
||||
// replace with heid if in R1
|
||||
if ( ($position = $this->search(array('heden'))) !== false) {
|
||||
if ($this->inR1($position)) {
|
||||
$this->word = preg_replace('#(heden)$#u', 'heid', $this->word);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// en ene
|
||||
// delete if in R1 and preceded by a valid en-ending, and then undouble the ending
|
||||
if ( ($position = $this->search(array('ene', 'en'))) !== false) {
|
||||
if ($this->inR1($position)) {
|
||||
$word = Utf8::substr($this->word, 0, $position);
|
||||
if ($this->hasValidEnEnding($word)) {
|
||||
$this->word = $word;
|
||||
$this->unDoubling();
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// s se
|
||||
// delete if in R1 and preceded by a valid s-ending
|
||||
if ( ($position = $this->search(array('se', 's'))) !== false) {
|
||||
if ($this->inR1($position)) {
|
||||
$word = Utf8::substr($this->word, 0, $position);
|
||||
if ($this->hasValidSEnding($word)) {
|
||||
$this->word = $word;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Step 2
|
||||
* Delete suffix e if in R1 and preceded by a non-vowel, and then undouble the ending
|
||||
*/
|
||||
private function step2()
|
||||
{
|
||||
if ( ($position = $this->search(array('e'))) !== false) {
|
||||
if ($this->inR1($position)) {
|
||||
$letter = Utf8::substr($this->word, -2, 1);
|
||||
if (!in_array($letter, self::$vowels)) {
|
||||
$this->word = Utf8::substr($this->word, 0, $position);
|
||||
$this->unDoubling();
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
 * Step 3a: heid
 *
 * Removes heid when it lies in R2 and is not preceded by c. A remaining
 * en suffix is then handled like the en rule of step 1: removed when in R1
 * and followed by a valid en-ending, after which the ending is undoubled.
 *
 * This method does not return a value.
 */
private function step3a()
{
    $heidAt = $this->search(array('heid'));

    if ($heidAt === false || !$this->inR2($heidAt)) {
        return;
    }

    // Fifth character from the end is the one right before "heid".
    if (Utf8::substr($this->word, -5, 1) === 'c') {
        return;
    }

    $this->word = Utf8::substr($this->word, 0, $heidAt);

    $enAt = $this->search(array('en'));

    if ($enAt === false || !$this->inR1($enAt)) {
        return;
    }

    $remainder = Utf8::substr($this->word, 0, $enAt);

    if ($this->hasValidEnEnding($remainder)) {
        $this->word = $remainder;
        $this->unDoubling();
    }
}
|
||||
|
||||
/**
 * Step 3b: d-suffixes
 *
 * Suffix groups are tried in this order and the first one that ends the
 * word decides the outcome:
 *  - end, ing: removed when in R2; afterwards a trailing ig that lies in R2
 *              is removed unless preceded by e, and when no such ig exists
 *              the ending is undoubled
 *  - ig:       removed when in R2 and not preceded by e
 *  - lijk:     removed when in R2, followed by a second run of step 2
 *  - baar:     removed when in R2
 *  - bar:      removed when in R2 and $removedE is truthy
 *
 * @param boolean $removedE whether step 2 actually removed an e
 *
 * @return boolean true when one of the suffixes ends the word, false otherwise
 */
private function step3b($removedE)
{
    $endAt = $this->search(array('end', 'ing'));

    if ($endAt !== false) {
        if ($this->inR2($endAt)) {
            $this->word = Utf8::substr($this->word, 0, $endAt);

            $igAt = $this->searchIfInR2(array('ig'));

            if ($igAt === false) {
                $this->unDoubling();
            } elseif (Utf8::substr($this->word, -3, 1) !== 'e') {
                $this->word = Utf8::substr($this->word, 0, $igAt);
            }
        }

        return true;
    }

    $igAt = $this->search(array('ig'));

    if ($igAt !== false) {
        // Third character from the end is the one right before "ig".
        if ($this->inR2($igAt) && Utf8::substr($this->word, -3, 1) !== 'e') {
            $this->word = Utf8::substr($this->word, 0, $igAt);
        }

        return true;
    }

    $lijkAt = $this->search(array('lijk'));

    if ($lijkAt !== false) {
        if ($this->inR2($lijkAt)) {
            $this->word = Utf8::substr($this->word, 0, $lijkAt);
            $this->step2();
        }

        return true;
    }

    $baarAt = $this->search(array('baar'));

    if ($baarAt !== false) {
        if ($this->inR2($baarAt)) {
            $this->word = Utf8::substr($this->word, 0, $baarAt);
        }

        return true;
    }

    $barAt = $this->search(array('bar'));

    if ($barAt === false) {
        return false;
    }

    if ($this->inR2($barAt) && $removedE) {
        $this->word = Utf8::substr($this->word, 0, $barAt);
    }

    return true;
}
|
||||
|
||||
/**
 * Step 4: undouble vowel
 *
 * When the word ends in C V D, where C is a non-vowel, V is one of the
 * doubled vowels aa, ee, oo or uu and D is a non-vowel other than I, one of
 * the two vowels is dropped (for example, maan -> man, brood -> brod).
 *
 * @return boolean|null false when the pattern does not apply; nothing is
 *                      returned when the word has been rewritten
 */
private function step4()
{
    // D: the final character must be neither a vowel nor the marker I.
    $last = Utf8::substr($this->word, -1, 1);

    if (in_array($last, array_merge(self::$vowels, array('I')))) {
        return false;
    }

    // V: the two characters before D must be a doubled a, e, o or u.
    $pair = Utf8::substr($this->word, -3, 2);

    if (!in_array($pair, array('aa', 'ee', 'oo', 'uu'))) {
        return false;
    }

    // C: the character before the vowel pair must not be a vowel.
    $before = Utf8::substr($this->word, -4, 1);

    if (in_array($before, self::$vowels)) {
        return false;
    }

    // Rebuild the ending with a single vowel in place of the pair.
    $head = Utf8::substr($this->word, 0, -4);
    $this->word = $head . $before . Utf8::substr($pair, 0, 1) . $last;
}
|
||||
|
||||
/**
 * Finally
 *
 * Converts the upper-case markers I and Y back to lower case.
 */
private function finish()
{
    $markers    = array('I', 'Y');
    $lowerCased = array('i', 'y');

    $this->word = Utf8::str_replace($markers, $lowerCased, $this->word);
}
|
||||
}
|
599
libraries/vendor/wamania/php-stemmer/src/English.php
vendored
Normal file
599
libraries/vendor/wamania/php-stemmer/src/English.php
vendored
Normal file
|
@ -0,0 +1,599 @@
|
|||
<?php
|
||||
namespace Wamania\Snowball;
|
||||
|
||||
/**
|
||||
* English Porter 2
|
||||
*
|
||||
* @link http://snowball.tartarus.org/algorithms/english/stemmer.html
|
||||
* @author wamania
|
||||
*
|
||||
*/
|
||||
class English extends Stem
|
||||
{
|
||||
/**
|
||||
* All english vowels
|
||||
*/
|
||||
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y');
|
||||
|
||||
protected static $doubles = array('bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt');
|
||||
|
||||
protected static $liEnding = array('c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't');
|
||||
|
||||
/**
 * {@inheritdoc}
 *
 * Runs the English (Porter 2) pipeline on a single word: lower-casing,
 * first exception list, prelude (leading apostrophe removal and y -> Y
 * marking), R1/R2 computation, then steps 0 to 5 and the final clean-up.
 *
 * Words shorter than three characters are returned exactly as given.
 *
 * @param string $word the word to stem, encoded in UTF-8
 *
 * @return string the stemmed word
 *
 * @throws \Exception when $word is not valid UTF-8
 */
public function stem($word)
{
    // we do ALL in UTF-8
    if (! Utf8::check($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    if (Utf8::strlen($word) < 3) {
        return $word;
    }

    $this->word = Utf8::strtolower($word);

    // exceptions
    if (null !== ($word = $this->exception1())) {
        return $word;
    }

    $this->plainVowels = implode('', self::$vowels);

    // Remove initial ', if present.
    if (Utf8::substr($this->word, 0, 1) == "'") {
        $this->word = Utf8::substr($this->word, 1);
    }

    // Set initial y, or y after a vowel, to Y.
    // The anchored pattern looks at the word as it is now, i.e. after the
    // apostrophe has been stripped, so "'yes" gets its y marked as well.
    $this->word = preg_replace('#^y#u', 'Y', $this->word);
    $this->word = preg_replace('#(['.$this->plainVowels.'])y#u', '$1Y', $this->word);

    $this->r1();
    $this->exceptionR1();
    $this->r2();

    $this->step0();
    $this->step1a();

    // exceptions 2
    if (null !== ($word = $this->exception2())) {
        return $word;
    }

    $this->step1b();
    $this->step1c();
    $this->step2();
    $this->step3();
    $this->step4();
    $this->step5();
    $this->finish();

    return $this->word;
}
|
||||
|
||||
/**
 * Step 0
 *
 * Strips a trailing 's', 's or ' from the word; the suffixes are looked up
 * in that order.
 */
private function step0()
{
    $suffixAt = $this->search(array("'s'", "'s", "'"));

    if ($suffixAt === false) {
        return;
    }

    $this->word = Utf8::substr($this->word, 0, $suffixAt);
}
|
||||
|
||||
/**
 * Step 1a
 *
 * Suffix groups are tried in this order; the first one that ends the word
 * decides the outcome:
 *  - sses:     replaced by ss
 *  - ied, ies: replaced by i when preceded by more than one letter,
 *              otherwise by ie (so ties -> tie, cries -> cri)
 *  - us, ss:   left alone
 *  - s:        removed when the preceding part contains a vowel that is not
 *              immediately before the s (gas and this keep the s, gaps and
 *              kiwis lose it)
 *
 * @return boolean true when one of the suffixes ends the word, false otherwise
 */
private function step1a()
{
    if ($this->search(array('sses')) !== false) {
        $this->word = preg_replace('#(sses)$#u', 'ss', $this->word);

        return true;
    }

    $iedAt = $this->search(array('ied', 'ies'));

    if ($iedAt !== false) {
        $replacement = ($iedAt > 1) ? 'i' : 'ie';
        $this->word  = preg_replace('#(ied|ies)$#u', $replacement, $this->word);

        return true;
    }

    if ($this->search(array('us', 'ss')) !== false) {
        return true;
    }

    $sAt = $this->search(array('s'));

    if ($sAt === false) {
        return false;
    }

    // Look for a vowel anywhere before the letter that precedes the s.
    $index = 0;

    while ($index < $sAt - 1) {
        if (in_array(Utf8::substr($this->word, $index, 1), self::$vowels)) {
            $this->word = Utf8::substr($this->word, 0, $sAt);
            break;
        }

        $index++;
    }

    return true;
}
|
||||
|
||||
/**
 * Step 1b
 *
 *  - eed, eedly: replaced by ee when in R1
 *  - ed, edly, ing, ingly: removed when the preceding part contains a
 *    vowel, and after the removal:
 *      - an e is appended when the word ends in at, bl or iz
 *        (luxuriat -> luxuriate), or
 *      - the last letter is dropped when the word ends with a double
 *        (hopp -> hop), or
 *      - an e is appended when the word is short (hop -> hope)
 *
 * @return boolean true when one of the suffixes ends the word, false otherwise
 */
private function step1b()
{
    $eedAt = $this->search(array('eedly', 'eed'));

    if ($eedAt !== false) {
        if ($this->inR1($eedAt)) {
            $this->word = preg_replace('#(eedly|eed)$#u', 'ee', $this->word);
        }

        return true;
    }

    $suffixAt = $this->search(array('edly', 'ingly', 'ed', 'ing'));

    if ($suffixAt === false) {
        return false;
    }

    // Does the part in front of the suffix contain a vowel?
    $hasVowel = false;

    for ($index = 0; $index < $suffixAt; $index++) {
        if (in_array(Utf8::substr($this->word, $index, 1), self::$vowels)) {
            $hasVowel = true;
            break;
        }
    }

    if (!$hasVowel) {
        return true;
    }

    $this->word = Utf8::substr($this->word, 0, $suffixAt);

    if ($this->search(array('at', 'bl', 'iz')) !== false) {
        $this->word .= 'e';
    } elseif (($doubleAt = $this->search(self::$doubles)) !== false) {
        // Keep only the first letter of the trailing double.
        $this->word = Utf8::substr($this->word, 0, ($doubleAt + 1));
    } elseif ($this->isShort()) {
        $this->word .= 'e';
    }

    return true;
}
|
||||
|
||||
/**
 * Step 1c
 *
 * Replaces a trailing y or Y by i when the letter before it is a non-vowel.
 * Words shorter than three characters are left alone, so the non-vowel is
 * never the first letter of the word (cry -> cri, by -> by, say -> say).
 *
 * @return boolean false only when the word has at least three characters
 *                 and does not end in y or Y; true in every other case
 */
private function step1c()
{
    if (Utf8::strlen($this->word) < 3) {
        return true;
    }

    $yAt = $this->search(array('y', 'Y'));

    if ($yAt === false) {
        return false;
    }

    $preceding = Utf8::substr($this->word, $yAt - 1, 1);

    if (! in_array($preceding, self::$vowels)) {
        $this->word = preg_replace('#(y|Y)$#u', 'i', $this->word);
    }

    return true;
}
|
||||
|
||||
/**
 * Step 2
 *
 * Walks an ordered list of suffix groups. The first group with a suffix
 * that ends the word decides the outcome: when that suffix lies in R1 the
 * listed replacement is applied, otherwise the word is left untouched.
 * Two groups have an extra condition:
 *  - ogi: replaced by og only when preceded by l
 *  - li:  removed only when preceded by a valid li-ending
 *
 * @return boolean true when one of the suffixes ends the word, false otherwise
 */
private function step2()
{
    // suffixes to look up (in lookup order), rewrite pattern, replacement
    $rules = array(
        array(array('iveness', 'iviti'), '#(iveness|iviti)$#u', 'ive'),
        array(array('ousli', 'ousness'), '#(ousli|ousness)$#u', 'ous'),
        array(array('izer', 'ization'), '#(izer|ization)$#u', 'ize'),
        array(array('ational', 'ation', 'ator'), '#(ational|ation|ator)$#u', 'ate'),
        array(array('biliti', 'bli'), '#(biliti|bli)$#u', 'ble'),
        array(array('lessli'), '#(lessli)$#u', 'less'),
        array(array('fulness', 'fulli'), '#(fulness|fulli)$#u', 'ful'),
        array(array('tional'), '#(tional)$#u', 'tion'),
        array(array('alism', 'aliti', 'alli'), '#(alism|aliti|alli)$#u', 'al'),
        array(array('enci'), '#(enci)$#u', 'ence'),
        array(array('anci'), '#(anci)$#u', 'ance'),
        array(array('abli'), '#(abli)$#u', 'able'),
        array(array('entli'), '#(entli)$#u', 'ent'),
    );

    foreach ($rules as $rule) {
        list($suffixes, $pattern, $replacement) = $rule;

        $suffixAt = $this->search($suffixes);

        if ($suffixAt === false) {
            continue;
        }

        if ($this->inR1($suffixAt)) {
            $this->word = preg_replace($pattern, $replacement, $this->word);
        }

        return true;
    }

    // ogi: replace by og if preceded by l
    $ogiAt = $this->search(array('ogi'));

    if ($ogiAt !== false) {
        if ($this->inR1($ogiAt) && Utf8::substr($this->word, $ogiAt - 1, 1) == 'l') {
            $this->word = preg_replace('#(ogi)$#u', 'og', $this->word);
        }

        return true;
    }

    // li: delete if preceded by a valid li-ending
    $liAt = $this->search(array('li'));

    if ($liAt === false) {
        return false;
    }

    if ($this->inR1($liAt)) {
        $preceding = Utf8::substr($this->word, ($liAt - 1), 1);

        if (in_array($preceding, self::$liEnding)) {
            $this->word = Utf8::substr($this->word, 0, $liAt);
        }
    }

    return true;
}
|
||||
|
||||
/**
 * Step 3
 *
 * Tries the following suffixes in order and applies the first one that
 * ends the word and lies in R1:
 *  - ational:            replaced by ate
 *  - tional:             replaced by tion
 *  - alize:              replaced by al
 *  - icate, iciti, ical: replaced by ic
 *  - ful, ness:          removed
 *  - ative:              removed, but only when it also lies in R2
 *
 * @return boolean true when the word was changed, false otherwise
 */
public function step3()
{
    // suffixes to look up, rewrite pattern, replacement
    $rules = array(
        array(array('ational'), '#(ational)$#u', 'ate'),
        array(array('tional'), '#(tional)$#u', 'tion'),
        array(array('alize'), '#(alize)$#u', 'al'),
        array(array('icate', 'iciti', 'ical'), '#(icate|iciti|ical)$#u', 'ic'),
    );

    foreach ($rules as $rule) {
        list($suffixes, $pattern, $replacement) = $rule;

        if ($this->searchIfInR1($suffixes) !== false) {
            $this->word = preg_replace($pattern, $replacement, $this->word);

            return true;
        }
    }

    // ful ness: delete
    $fulAt = $this->searchIfInR1(array('ful', 'ness'));

    if ($fulAt !== false) {
        $this->word = Utf8::substr($this->word, 0, $fulAt);

        return true;
    }

    // ative: delete if in R2
    $ativeAt = $this->searchIfInR1(array('ative'));

    if ($ativeAt !== false && $this->inR2($ativeAt)) {
        $this->word = Utf8::substr($this->word, 0, $ativeAt);

        return true;
    }

    return false;
}
|
||||
|
||||
/**
 * Step 4
 *
 *  - ance ence ement able ible ant ment ent ism ate iti ous ive ize al er ic:
 *    the first of these (in lookup order) that ends the word is removed
 *    when it lies in R2
 *  - ion: removed when it lies in R2 and is preceded by s or t
 *
 * @return boolean true when one of the suffixes was found, false otherwise
 */
public function step4()
{
    $suffixes = array(
        'ance', 'ence', 'ement', 'able', 'ible', 'ant', 'ment', 'ent', 'ism',
        'ate', 'iti', 'ous', 'ive', 'ize', 'al', 'er', 'ic',
    );

    $suffixAt = $this->search($suffixes);

    if ($suffixAt !== false) {
        if ($this->inR2($suffixAt)) {
            $this->word = Utf8::substr($this->word, 0, $suffixAt);
        }

        return true;
    }

    $ionAt = $this->searchIfInR2(array('ion'));

    if ($ionAt === false) {
        return false;
    }

    $preceding = Utf8::substr($this->word, $ionAt - 1, 1);

    if ($preceding == 's' || $preceding == 't') {
        $this->word = Utf8::substr($this->word, 0, $ionAt);
    }

    return true;
}
|
||||
|
||||
/**
 * Step 5
 *
 *  - e: removed when in R2, or when in R1 and not preceded by a short
 *       syllable
 *  - l: removed when in R2 and preceded by l
 *
 * @return boolean true when the word ends in e, or ends in an l that lies
 *                 in R2; false otherwise
 */
public function step5()
{
    $eAt = $this->search(array('e'));

    if ($eAt !== false) {
        $removable = $this->inR2($eAt)
            || ($this->inR1($eAt)
                && !($this->searchShortSyllabe(-4, 3) || $this->searchShortSyllabe(-3, 2)));

        if ($removable) {
            $this->word = Utf8::substr($this->word, 0, $eAt);
        }

        return true;
    }

    $lAt = $this->searchIfInR2(array('l'));

    if ($lAt === false) {
        return false;
    }

    if (Utf8::substr($this->word, $lAt - 1, 1) == 'l') {
        $this->word = Utf8::substr($this->word, 0, $lAt);
    }

    return true;
}
|
||||
|
||||
/**
 * Finally
 *
 * Turns every Y marker in the word back into a lower-case y.
 */
public function finish()
{
    $restored   = Utf8::str_replace('Y', 'y', $this->word);
    $this->word = $restored;
}
|
||||
|
||||
/**
 * Overrides R1 for words starting with gener, commun or arsen: R1 becomes
 * everything after that prefix and r1Index the length of the prefix.
 * Other words are left untouched.
 */
private function exceptionR1()
{
    // prefix => number of characters in the prefix
    $prefixes = array(
        'gener'  => 5,
        'commun' => 6,
        'arsen'  => 5,
    );

    foreach ($prefixes as $prefix => $prefixLength) {
        if (Utf8::strpos($this->word, $prefix) === 0) {
            $this->r1      = Utf8::substr($this->word, $prefixLength);
            $this->r1Index = $prefixLength;

            return;
        }
    }
}
|
||||
|
||||
/**
 * First exception list, looked up with the current word:
 *  1/ special words that map to a fixed stem,
 *  2/ words that are left invariant (they map to themselves).
 *
 * @return string|null the fixed stem, or null when the word is not listed
 */
private function exception1()
{
    $fixedStems = array(
        // special words
        'skis'   => 'ski',
        'skies'  => 'sky',
        'dying'  => 'die',
        'lying'  => 'lie',
        'tying'  => 'tie',
        'idly'   => 'idl',
        'gently' => 'gentl',
        'ugly'   => 'ugli',
        'early'  => 'earli',
        'only'   => 'onli',
        'singly' => 'singl',
        // invariants
        'sky'    => 'sky',
        'news'   => 'news',
        'howe'   => 'howe',
        'atlas'  => 'atlas',
        'cosmos' => 'cosmos',
        'bias'   => 'bias',
        'andes'  => 'andes',
    );

    return isset($fixedStems[$this->word]) ? $fixedStems[$this->word] : null;
}
|
||||
|
||||
/**
 * Second exception list: words that are left invariant. The caller in this
 * class consults it right after step 1a.
 *
 * @return string|null the word itself when it is listed, null otherwise
 */
private function exception2()
{
    $invariants = array(
        'inning'  => 'inning',
        'outing'  => 'outing',
        'canning' => 'canning',
        'herring' => 'herring',
        'earring' => 'earring',
        'proceed' => 'proceed',
        'exceed'  => 'exceed',
        'succeed' => 'succeed',
    );

    return isset($invariants[$this->word]) ? $invariants[$this->word] : null;
}
|
||||
|
||||
/**
 * A word is called short when it ends in a short syllable and R1 is null.
 * R1 is treated as null here when the current length of the word equals
 * the R1 index.
 *
 * @return boolean
 */
private function isShort()
{
    $endsInShortSyllable = $this->searchShortSyllabe(-3, 3) || $this->searchShortSyllabe(-2, 2);

    return $endsInShortSyllable && (Utf8::strlen($this->word) == $this->r1Index);
}
|
||||
|
||||
/**
 * Tests for a short syllable starting at the given offset.
 *
 * A short syllable is either
 *  (a) a vowel followed by a non-vowel other than w, x or Y and preceded by
 *      a non-vowel, or
 *  (b) a vowel at the beginning of the word followed by a non-vowel.
 *
 * So rap, trap, entrap end with a short syllable, and ow, on, at are classed
 * as short syllables. But uproot, bestow, disturb do not end with a short
 * syllable.
 *
 * @param integer $from      offset of the first letter to inspect; a negative
 *                           value counts from the end of the word and is
 *                           clamped to the start of the word
 * @param integer $nbLetters 2 to test form (b), any other value for form (a)
 *
 * @return boolean
 */
private function searchShortSyllabe($from, $nbLetters)
{
    if ($from < 0) {
        $from = max(0, Utf8::strlen($this->word) + $from);
    }

    $twoLetterForm = ($nbLetters == 2);

    // Form (b) only exists at the very beginning of the word.
    if ($twoLetterForm && ($from != 0)) {
        return false;
    }

    $firstIsVowel  = in_array(Utf8::substr($this->word, $from, 1), self::$vowels);
    $secondIsVowel = in_array(Utf8::substr($this->word, ($from + 1), 1), self::$vowels);

    if ($twoLetterForm && $firstIsVowel && !$secondIsVowel) {
        return true;
    }

    // Form (a): non-vowel, vowel, then a non-vowel other than x, Y or w.
    // This test is also reached when the two-letter test above did not match.
    $third        = Utf8::substr($this->word, ($from + 2), 1);
    $thirdBlocked = in_array($third, array_merge(self::$vowels, array('x', 'Y', 'w')));

    return !$firstIsVowel && $secondIsVowel && !$thirdBlocked;
}
|
||||
}
|
530
libraries/vendor/wamania/php-stemmer/src/French.php
vendored
Normal file
530
libraries/vendor/wamania/php-stemmer/src/French.php
vendored
Normal file
|
@ -0,0 +1,530 @@
|
|||
<?php
|
||||
namespace Wamania\Snowball;
|
||||
|
||||
/**
|
||||
*
|
||||
* @link http://snowball.tartarus.org/algorithms/french/stemmer.html
|
||||
* @author wamania
|
||||
*
|
||||
*/
|
||||
class French extends Stem
|
||||
{
|
||||
/**
|
||||
* All french vowels
|
||||
*/
|
||||
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y', 'â', 'à', 'ë', 'é', 'ê', 'è', 'ï', 'î', 'ô', 'û', 'ù');
|
||||
|
||||
/**
 * {@inheritdoc}
 *
 * Runs the French pipeline on a single word: lower-casing, step 0 (marking
 * letters in upper case), RV/R1/R2 computation, then steps 1 to 6 and the
 * final clean-up.
 *
 * @param string $word the word to stem, encoded in UTF-8
 *
 * @return string the stemmed word
 *
 * @throws \Exception when $word is not valid UTF-8
 */
public function stem($word)
{
    // Everything below works on UTF-8 text.
    if (! Utf8::check($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word        = Utf8::strtolower($word);
    $this->plainVowels = implode('', self::$vowels);

    $this->step0();

    $this->rv();
    $this->r1();
    $this->r2();

    // Snapshot used to detect whether step 1, 2a or 2b altered the word.
    $this->originalWord = $this->word;

    $nextStep = $this->step1();

    // Step 2a runs when step 1 asks for it (endings amment, emment, ment,
    // ments) or when step 1 left the word unchanged; step 2b runs only when
    // step 2a reports no modification.
    if (($nextStep == 2) || ($this->originalWord == $this->word)) {
        if (!$this->step2a()) {
            $this->step2b();
        }
    }

    // Step 3 after a change, step 4 otherwise.
    if ($this->word != $this->originalWord) {
        $this->step3();
    } else {
        $this->step4();
    }

    $this->step5();
    $this->step6();
    $this->finish();

    return $this->word;
}
|
||||
|
||||
|
||||
|
||||
/**
 * Step 0
 *
 * Assumes the word is in lower case and marks some letters in upper case,
 * applying these rewrites one after the other:
 *  - u after q                           (quand -> qUand)
 *  - y preceded by a vowel
 *  - y followed by a vowel               (yeux -> Yeux)
 *  - u preceded and followed by a vowel  (jouer -> joUer)
 *  - i preceded and followed by a vowel  (ennuie -> ennuIe)
 */
private function step0()
{
    // Capturing character class holding every vowel.
    $vowel = '([' . $this->plainVowels . '])';

    // pattern => replacement, applied in this order
    $rewrites = array(
        '#([q])u#u'                     => '$1U',
        '#' . $vowel . 'y#u'            => '$1Y',
        '#y' . $vowel . '#u'            => 'Y$1',
        '#' . $vowel . 'u' . $vowel . '#u' => '$1U$2',
        '#' . $vowel . 'i' . $vowel . '#u' => '$1I$2',
    );

    foreach ($rewrites as $pattern => $replacement) {
        $this->word = preg_replace($pattern, $replacement, $this->word);
    }
}
|
||||
|
||||
/**
 * Step 1
 *
 * Standard suffix removal. The suffix groups below are tried in order and
 * the first group with a suffix that ends the word decides the outcome;
 * the word itself is only changed when the group's region condition holds.
 *
 * Every suffix lookup is compared against false with the strict operator,
 * so a suffix found at offset 0 is still treated as found.
 *
 * @return integer Next step number: 2 when amment, emment, ment or ments
 *                 was found or when no suffix matched at all, 3 otherwise
 */
private function step1()
{
    // ance iqUe isme able iste eux ances iqUes ismes ables istes
    //      delete if in R2
    if (($position = $this->search(array('ances', 'iqUes', 'ismes', 'ables', 'istes', 'ance', 'iqUe', 'isme', 'able', 'iste', 'eux'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        return 3;
    }

    // atrice ateur ation atrices ateurs ations
    //      delete if in R2
    //      if preceded by ic, delete if in R2, else replace by iqU
    if (($position = $this->search(array('atrices', 'ateurs', 'ations', 'atrice', 'ateur', 'ation'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);

            if (($position2 = $this->searchIfInR2(array('ic'))) !== false) {
                $this->word = Utf8::substr($this->word, 0, $position2);
            } else {
                $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word);
            }
        }

        return 3;
    }

    // logie logies
    //      replace with log if in R2
    if (($position = $this->search(array('logies', 'logie'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace('#(logies|logie)$#u', 'log', $this->word);
        }

        return 3;
    }

    // usion ution usions utions
    //      replace with u if in R2
    if (($position = $this->search(array('usions', 'utions', 'usion', 'ution'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace('#(usion|ution|usions|utions)$#u', 'u', $this->word);
        }

        return 3;
    }

    // ence ences
    //      replace with ent if in R2
    if (($position = $this->search(array('ences', 'ence'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace('#(ence|ences)$#u', 'ent', $this->word);
        }

        return 3;
    }

    // issement issements
    //      delete if in R1 and preceded by a non-vowel
    if (($position = $this->search(array('issements', 'issement'))) !== false) {
        if ($this->inR1($position)) {
            $before = $position - 1;
            $letter = Utf8::substr($this->word, $before, 1);

            if (! in_array($letter, self::$vowels)) {
                $this->word = Utf8::substr($this->word, 0, $position);
            }
        }

        return 3;
    }

    // ement ements
    //      delete if in RV
    //      if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
    //      if preceded by eus, delete if in R2, else replace by eux if in R1, otherwise,
    //      if preceded by abl or iqU, delete if in R2, otherwise,
    //      if preceded by ièr or Ièr, replace by i if in RV
    if (($position = $this->search(array('ements', 'ement'))) !== false) {
        // delete if in RV
        if ($this->inRv($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        // The follow-up suffixes are looked up on the current word, so they
        // can only match once the ement(s) ending has been removed.
        if (($position = $this->searchIfInR2(array('iv'))) !== false) {
            // preceded by iv: delete if in R2, then a further at if in R2
            $this->word = Utf8::substr($this->word, 0, $position);

            if (($position2 = $this->searchIfInR2(array('at'))) !== false) {
                $this->word = Utf8::substr($this->word, 0, $position2);
            }
        } elseif (($position = $this->search(array('eus'))) !== false) {
            // preceded by eus: delete if in R2, else replace by eux if in R1
            if ($this->inR2($position)) {
                $this->word = Utf8::substr($this->word, 0, $position);
            } elseif ($this->inR1($position)) {
                $this->word = preg_replace('#(eus)$#u', 'eux', $this->word);
            }
        } elseif (($position = $this->searchIfInR2(array('abl', 'iqU'))) !== false) {
            // preceded by abl or iqU: delete if in R2
            $this->word = Utf8::substr($this->word, 0, $position);
        } elseif (($position = $this->searchIfInRv(array('ièr', 'Ièr'))) !== false) {
            // preceded by ièr or Ièr: replace by i if in RV
            $this->word = preg_replace('#(ièr|Ièr)$#u', 'i', $this->word);
        }

        return 3;
    }

    // ité ités
    //      delete if in R2
    //      if preceded by abil, delete if in R2, else replace by abl, otherwise,
    //      if preceded by ic, delete if in R2, else replace by iqU, otherwise,
    //      if preceded by iv, delete if in R2
    if (($position = $this->search(array('ités', 'ité'))) !== false) {
        // delete if in R2
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        if (($position = $this->search(array('abil'))) !== false) {
            // preceded by abil: delete if in R2, else replace by abl
            if ($this->inR2($position)) {
                $this->word = Utf8::substr($this->word, 0, $position);
            } else {
                $this->word = preg_replace('#(abil)$#u', 'abl', $this->word);
            }
        } elseif (($position = $this->search(array('ic'))) !== false) {
            // preceded by ic: delete if in R2, else replace by iqU
            if ($this->inR2($position)) {
                $this->word = Utf8::substr($this->word, 0, $position);
            } else {
                $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word);
            }
        } elseif (($position = $this->searchIfInR2(array('iv'))) !== false) {
            // preceded by iv: delete if in R2
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        return 3;
    }

    // if ive ifs ives
    //      delete if in R2
    //      if preceded by at, delete if in R2 (and if further preceded by ic, delete if in R2, else replace by iqU)
    if (($position = $this->search(array('ifs', 'ives', 'if', 'ive'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        if (($position = $this->searchIfInR2(array('at'))) !== false) {
            $this->word = Utf8::substr($this->word, 0, $position);

            if (($position2 = $this->search(array('ic'))) !== false) {
                if ($this->inR2($position2)) {
                    $this->word = Utf8::substr($this->word, 0, $position2);
                } else {
                    $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word);
                }
            }
        }

        return 3;
    }

    // eaux
    //      replace with eau
    if (($position = $this->search(array('eaux'))) !== false) {
        $this->word = preg_replace('#(eaux)$#u', 'eau', $this->word);

        return 3;
    }

    // aux
    //      replace with al if in R1
    if (($position = $this->search(array('aux'))) !== false) {
        if ($this->inR1($position)) {
            $this->word = preg_replace('#(aux)$#u', 'al', $this->word);
        }

        return 3;
    }

    // euse euses
    //      delete if in R2, else replace by eux if in R1
    if (($position = $this->search(array('euses', 'euse'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        } elseif ($this->inR1($position)) {
            $this->word = preg_replace('#(euses|euse)$#u', 'eux', $this->word);
        }

        return 3;
    }

    // amment
    //      replace with ant if in RV
    if (($position = $this->search(array('amment'))) !== false) {
        if ($this->inRv($position)) {
            $this->word = preg_replace('#(amment)$#u', 'ant', $this->word);
        }

        return 2;
    }

    // emment
    //      replace with ent if in RV
    if (($position = $this->search(array('emment'))) !== false) {
        if ($this->inRv($position)) {
            $this->word = preg_replace('#(emment)$#u', 'ent', $this->word);
        }

        return 2;
    }

    // ment ments
    //      delete if preceded by a vowel in RV
    if (($position = $this->search(array('ments', 'ment'))) !== false) {
        $before = $position - 1;
        $letter = Utf8::substr($this->word, $before, 1);

        if ($this->inRv($before) && (in_array($letter, self::$vowels))) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        return 2;
    }

    // No suffix of this step matched.
    return 2;
}
|
||||
|
||||
/**
 * Step 2a: Verb suffixes beginning with i.
 *
 * In steps 2a and 2b all tests are confined to the RV region.
 * Looks for one of the suffixes below inside RV and deletes it when the letter
 * immediately before it is a non-vowel that is itself inside RV:
 *   îmes ît îtes i ie ies ir ira irai iraIent irais irait iras irent irez iriez
 *   irions irons iront is issaIent issais issait issant issante issantes issants isse
 *   issent isses issez issiez issions issons it
 *
 * @return boolean true when a suffix was removed, false otherwise
 */
private function step2a()
{
    // The list is passed to the lookup helper unchanged, in its original order.
    $endings = array(
        'îmes', 'îtes', 'ît', 'ies', 'ie', 'iraIent', 'irais', 'irait', 'irai', 'iras', 'ira', 'irent', 'irez', 'iriez',
        'irions', 'irons', 'iront', 'ir', 'issaIent', 'issais', 'issait', 'issant', 'issantes', 'issante', 'issants',
        'issent', 'isses', 'issez', 'isse', 'issiez', 'issions', 'issons', 'is', 'it', 'i'
    );

    $suffixStart = $this->searchIfInRv($endings);

    if ($suffixStart === false) {
        return false;
    }

    $precedingIndex  = $suffixStart - 1;
    $precedingLetter = Utf8::substr($this->word, $precedingIndex, 1);

    // The suffix is only removed after a non-vowel that also lies in RV.
    if (!$this->inRv($precedingIndex) || in_array($precedingLetter, self::$vowels)) {
        return false;
    }

    $this->word = Utf8::substr($this->word, 0, $suffixStart);

    return true;
}
|
||||
|
||||
/**
 * Step 2b: Other verb suffixes.
 *
 * Do step 2b if step 2a was done, but failed to remove a suffix.
 * Three suffix groups are tried in turn, all of them searched inside RV; the
 * first group that matches decides the outcome.
 *
 * @return boolean true when one of the groups matched, false otherwise
 */
private function step2b()
{
    // é ée ées és èrent er era erai eraIent erais erait eras erez eriez erions erons eront ez iez
    // -> delete
    $cutAt = $this->searchIfInRv(array(
        'ées', 'èrent', 'erais', 'erait', 'erai', 'eraIent', 'eras', 'erez', 'eriez',
        'erions', 'erons', 'eront', 'era', 'er', 'iez', 'ez','és', 'ée', 'é'
    ));

    if ($cutAt !== false) {
        $this->word = Utf8::substr($this->word, 0, $cutAt);

        return true;
    }

    // âmes ât âtes a ai aIent ais ait ant ante antes ants as asse assent asses assiez assions
    // -> delete, together with a preceding e when that e is in RV
    $cutAt = $this->searchIfInRv(array(
        'âmes', 'âtes', 'ât', 'aIent', 'ais', 'ait', 'antes', 'ante', 'ants', 'ant',
        'assent', 'asses', 'assiez', 'assions', 'asse', 'as', 'ai', 'a'
    ));

    if ($cutAt !== false) {
        $previousIndex  = $cutAt - 1;
        $previousLetter = Utf8::substr($this->word, $previousIndex, 1);
        $dropPrecedingE = $this->inRv($previousIndex) && ($previousLetter == 'e');

        $this->word = Utf8::substr($this->word, 0, $dropPrecedingE ? $previousIndex : $cutAt);

        return true;
    }

    // ions -> delete if in R2
    $cutAt = $this->searchIfInRv(array('ions'));

    if ($cutAt === false) {
        return false;
    }

    if ($this->inR2($cutAt)) {
        $this->word = Utf8::substr($this->word, 0, $cutAt);
    }

    // A match counts as handled even when the suffix was outside R2.
    return true;
}
|
||||
|
||||
/**
 * Step 3: Replace a final Y with i, then a final ç with c.
 */
private function step3()
{
    // The two replacements are applied one after the other, in this order.
    $patterns     = array('#(Y)$#u', '#(ç)$#u');
    $replacements = array('i', 'c');

    $this->word = preg_replace($patterns, $replacements, $this->word);
}
|
||||
|
||||
/**
 * Step 4: Residual suffix.
 *
 * First drops a final s that is not preceded by a, i, o, u, è or s. Then, with
 * all tests confined to RV, applies the first matching rule among:
 * ion (delete if in R2 and preceded by s or t), ier/ière/Ier/Ière (replace
 * with i), e (delete), guë (delete the ë).
 *
 * @return boolean true when one of the RV rules matched, false otherwise
 */
private function step4()
{
    // If the word ends s, not preceded by a, i, o, u, è or s, delete it.
    // The "u" modifier makes the character class work on whole UTF-8 characters;
    // without it the multi-byte "è" is split into two separate bytes in the class.
    if (preg_match('#[^aiouès]s$#u', $this->word)) {
        $this->word = Utf8::substr($this->word, 0, -1);
    }

    // In the rest of step 4, all tests are confined to the RV region.

    // ion
    // delete if in R2 and preceded by s or t
    if ((($position = $this->searchIfInRv(array('ion'))) !== false) && ($this->inR2($position))) {
        $before = $position - 1;
        $letter = Utf8::substr($this->word, $before, 1);

        if ($this->inRv($before) && (($letter == 's') || ($letter == 't'))) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        // The rule counts as matched even when the s/t condition failed.
        return true;
    }

    // ier ière Ier Ière
    // replace with i
    if (($this->searchIfInRv(array('ier', 'ière', 'Ier', 'Ière'))) !== false) {
        $this->word = preg_replace('#(ier|ière|Ier|Ière)$#u', 'i', $this->word);

        return true;
    }

    // e
    // delete
    if (($this->searchIfInRv(array('e'))) !== false) {
        $this->word = Utf8::substr($this->word, 0, -1);

        return true;
    }

    // ë
    // if preceded by gu, delete
    if (($position = $this->searchIfInRv(array('guë'))) !== false) {
        // $position + 2 is the index of the ë itself.
        if ($this->inRv($position + 2)) {
            $this->word = Utf8::substr($this->word, 0, -1);

            return true;
        }
    }

    return false;
}
|
||||
|
||||
/**
 * Step 5: Undouble.
 *
 * If the word ends enn, onn, ett, ell or eill, delete the last letter.
 */
private function step5()
{
    $doubledEndings = array('enn', 'onn', 'ett', 'ell', 'eill');

    if ($this->search($doubledEndings) === false) {
        return;
    }

    $this->word = Utf8::substr($this->word, 0, -1);
}
|
||||
|
||||
/**
 * Step 6: Un-accent.
 *
 * If the word ends with é or è followed by at least one non-vowel, remove the
 * accent from the e.
 */
private function step6()
{
    // "Non-vowel" here means any character not listed in $this->plainVowels.
    $nonVowels = '[^' . $this->plainVowels . ']';
    $pattern   = '#(é|è)(' . $nonVowels . '+)$#u';

    $this->word = preg_replace($pattern, 'e$2', $this->word);
}
|
||||
|
||||
/**
 * And finally:
 * turn any remaining I, U and Y letters in the word back into lower case.
 */
private function finish()
{
    $markers   = array('I', 'U', 'Y');
    $lowercase = array('i', 'u', 'y');

    $this->word = Utf8::str_replace($markers, $lowercase, $this->word);
}
|
||||
|
||||
/**
 * Computes the RV region of the current word.
 *
 * If the word begins with two vowels, RV is the region after the third letter,
 * otherwise the region after the first vowel not at the beginning of the word,
 * or the end of the word if these positions cannot be found.
 * (Exceptionally, par, col or tap at the beginning of a word is also taken to
 * define RV as the region to their right.)
 *
 * Stores the result in $this->rv (text) and $this->rvIndex (start index).
 *
 * @return boolean false only when a word of three or more letters has no vowel
 *                 after its first letter; true in every other case
 */
protected function rv()
{
    $wordLength = Utf8::strlen($this->word);

    // Default: an empty RV that starts at the end of the word.
    $this->rv      = '';
    $this->rvIndex = $wordLength;

    if ($wordLength < 3) {
        return true;
    }

    $startsWithTwoVowels = in_array(Utf8::substr($this->word, 0, 1), self::$vowels)
        && in_array(Utf8::substr($this->word, 1, 1), self::$vowels);

    // Two leading vowels, or one of the exceptional prefixes par/col/tap:
    // RV is everything after the third letter.
    if ($startsWithTwoVowels || in_array(Utf8::substr($this->word, 0, 3), array('par', 'col', 'tap'))) {
        $this->rvIndex = 3;
        $this->rv      = Utf8::substr($this->word, 3);

        return true;
    }

    // Otherwise RV starts right after the first vowel that is not the first letter.
    $index = 1;

    while ($index < $wordLength) {
        if (in_array(Utf8::substr($this->word, $index, 1), self::$vowels)) {
            $this->rvIndex = $index + 1;
            $this->rv      = Utf8::substr($this->word, $index + 1);

            return true;
        }

        $index++;
    }

    return false;
}
|
||||
}
|
213
libraries/vendor/wamania/php-stemmer/src/German.php
vendored
Normal file
213
libraries/vendor/wamania/php-stemmer/src/German.php
vendored
Normal file
|
@ -0,0 +1,213 @@
|
|||
<?php
|
||||
namespace Wamania\Snowball;
|
||||
|
||||
/**
|
||||
*
|
||||
* @link http://snowball.tartarus.org/algorithms/german/stemmer.html
|
||||
* @author wamania
|
||||
*
|
||||
*/
|
||||
class German extends Stem
|
||||
{
|
||||
/**
|
||||
* All German vowels
|
||||
*/
|
||||
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y', 'ä', 'ö', 'ü');
|
||||
|
||||
protected static $sEndings = array('b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 'r' ,'t');
|
||||
|
||||
protected static $stEndings = array('b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't');
|
||||
|
||||
/**
 * Stems a German word.
 *
 * {@inheritdoc}
 *
 * @throws \Exception when $word is not valid UTF-8
 */
public function stem($word)
{
    // we do ALL in UTF-8
    if (!Utf8::check($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->plainVowels = implode('', self::$vowels);

    // Lower-case the word, then replace ß by ss.
    $lowered    = Utf8::strtolower($word);
    $this->word = Utf8::str_replace('ß', 'ss', $lowered);

    // Put y and then u between vowels into upper case.
    $vowelClass = '[' . $this->plainVowels . ']';

    foreach (array('y' => 'Y', 'u' => 'U') as $lower => $upper) {
        $this->word = preg_replace(
            '#(' . $vowelClass . ')' . $lower . '(' . $vowelClass . ')#u',
            '$1' . $upper . '$2',
            $this->word
        );
    }

    // R1 and R2 are first set up in the standard way
    $this->r1();
    $this->r2();

    // but then R1 is adjusted so that the region before it contains at least 3 letters.
    $minimumPrefix = 3;

    if ($this->r1Index < $minimumPrefix) {
        $this->r1Index = $minimumPrefix;
        $this->r1      = Utf8::substr($this->word, $minimumPrefix);
    }

    $this->step1();
    $this->step2();
    $this->step3();
    $this->finish();

    return $this->word;
}
|
||||
|
||||
/**
 * Step 1: tries three suffix groups in order and stops at the first group
 * whose suffix ends the word.
 *
 *  (a) em ern er - delete if in R1
 *  (b) es en e   - delete if in R1; if the result ends in niss, drop the final s
 *  (c) s         - delete if in R1 and preceded by a valid s-ending
 *
 * @return boolean true when a suffix of any group was found, false otherwise
 */
public function step1()
{
    // (a) em ern er
    $position = $this->search(array('em', 'ern', 'er'));

    if ($position !== false) {
        if ($this->inR1($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        return true;
    }

    // (b) es en e
    $position = $this->search(array('es', 'en', 'e'));

    if ($position !== false) {
        if ($this->inR1($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);

            // If an ending of group (b) is deleted, and the ending is preceded by niss, delete the final s
            $endsWithNiss = $this->search(array('niss')) !== false;

            if ($endsWithNiss) {
                $this->word = Utf8::substr($this->word, 0, -1);
            }
        }

        return true;
    }

    // (c) s (preceded by a valid s-ending)
    $position = $this->search(array('s'));

    if ($position === false) {
        return false;
    }

    if ($this->inR1($position)
        && in_array(Utf8::substr($this->word, $position - 1, 1), self::$sEndings)
    ) {
        $this->word = Utf8::substr($this->word, 0, $position);
    }

    return true;
}
|
||||
|
||||
/**
 * Step 2: tries two suffix groups in order and stops at the first group whose
 * suffix ends the word.
 *
 *  (a) en er est - delete if in R1
 *  (b) st        - delete if in R1 and preceded by a valid st-ending that is
 *                  itself preceded by at least 3 letters
 *
 * @return boolean true when a suffix of either group was found, false otherwise
 */
public function step2()
{
    // (a) en er est
    $position = $this->search(array('en', 'er', 'est'));

    if ($position !== false) {
        if ($this->inR1($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        return true;
    }

    // (b) st
    $position = $this->search(array('st'));

    if ($position === false) {
        return false;
    }

    if ($this->inR1($position)) {
        // Index of the letter before "st"; it also equals the number of
        // letters that precede that letter.
        $endingIndex = $position - 1;

        if ($endingIndex >= 3
            && in_array(Utf8::substr($this->word, $endingIndex, 1), self::$stEndings)
        ) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }
    }

    return true;
}
|
||||
|
||||
/**
 * Step 3: d-suffixes.
 *
 * Tries four suffix groups in order and stops at the first group whose suffix
 * ends the word:
 *
 *  - end ung     : delete if in R2; then, if the word ends in ig, delete the ig
 *                  if it is in R2 and not preceded by e
 *  - ig ik isch  : delete if in R2 and not preceded by e
 *  - lich heit   : delete if in R2; then, if the word ends in er or en,
 *                  delete that if in R1
 *  - keit        : delete if in R2; then, if the word ends in lich or ig,
 *                  delete that if in R2
 *
 * Positions are compared with !== false throughout, so a suffix found at
 * index 0 is treated as found rather than confused with "no match".
 *
 * @return boolean true when a suffix of any group was found, false otherwise
 */
public function step3()
{
    // end ung
    // delete if in R2
    // if preceded by ig, delete if in R2 and not preceded by e
    if (($position = $this->search(array('end', 'ung'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        if (($position2 = $this->search(array('ig'))) !== false) {
            $before = $position2 - 1;
            $letter = Utf8::substr($this->word, $before, 1);

            if (($this->inR2($position2)) && ($letter != 'e')) {
                $this->word = Utf8::substr($this->word, 0, $position2);
            }
        }

        return true;
    }

    // ig ik isch
    // delete if in R2 and not preceded by e
    if (($position = $this->search(array('ig', 'ik', 'isch'))) !== false) {
        $before = $position - 1;
        $letter = Utf8::substr($this->word, $before, 1);

        if (($this->inR2($position)) && ($letter != 'e')) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        return true;
    }

    // lich heit
    // delete if in R2
    // if preceded by er or en, delete if in R1
    if (($position = $this->search(array('lich', 'heit'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        if (($position2 = $this->search(array('er', 'en'))) !== false) {
            if ($this->inR1($position2)) {
                $this->word = Utf8::substr($this->word, 0, $position2);
            }
        }

        return true;
    }

    // keit
    // delete if in R2
    // if preceded by lich or ig, delete if in R2
    if (($position = $this->search(array('keit'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        if (($position2 = $this->search(array('lich', 'ig'))) !== false) {
            if ($this->inR2($position2)) {
                $this->word = Utf8::substr($this->word, 0, $position2);
            }
        }

        return true;
    }

    return false;
}
|
||||
|
||||
/**
 * Finally: turn U and Y back into lower case, and remove the umlaut accent
 * from a, o and u.
 */
public function finish()
{
    $from = array('U', 'Y', 'ä', 'ü', 'ö');
    $to   = array('u', 'y', 'a', 'u', 'o');

    $this->word = Utf8::str_replace($from, $to, $this->word);
}
|
||||
}
|
286
libraries/vendor/wamania/php-stemmer/src/Italian.php
vendored
Normal file
286
libraries/vendor/wamania/php-stemmer/src/Italian.php
vendored
Normal file
|
@ -0,0 +1,286 @@
|
|||
<?php
|
||||
namespace Wamania\Snowball;
|
||||
|
||||
/**
|
||||
*
|
||||
* @link http://snowball.tartarus.org/algorithms/italian/stemmer.html
|
||||
* @author wamania
|
||||
*
|
||||
*/
|
||||
class Italian extends Stem
|
||||
{
|
||||
/**
|
||||
* All Italian vowels
|
||||
*/
|
||||
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'à', 'è', 'ì', 'ò', 'ù');
|
||||
|
||||
/**
 * Stems an Italian word.
 *
 * {@inheritdoc}
 *
 * @throws \Exception when $word is not valid UTF-8
 */
public function stem($word)
{
    // we do ALL in UTF-8
    if (! Utf8::check($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->plainVowels = implode('', self::$vowels);

    $this->word = Utf8::strtolower($word);

    // First, replace all acute accents by grave accents.
    $this->word = Utf8::str_replace(array('á', 'é', 'í', 'ó', 'ú'), array('à', 'è', 'ì', 'ò', 'ù'), $this->word);

    // And, as in French, put u after q, and u, i between vowels into upper case,
    // so that they are no longer treated as vowels by the steps below.
    $this->word = preg_replace('#([q])u#u', '$1U', $this->word);
    $this->word = preg_replace('#(['.$this->plainVowels.'])u(['.$this->plainVowels.'])#u', '$1U$2', $this->word);
    $this->word = preg_replace('#(['.$this->plainVowels.'])i(['.$this->plainVowels.'])#u', '$1I$2', $this->word);

    $this->rv();
    $this->r1();
    $this->r2();

    $this->step0();

    // Snapshot taken after step 0, to detect whether step 1 changed anything.
    $word = $this->word;
    $this->step1();

    // Do step 2 if no ending was removed by step 1.
    // Strict comparison: both sides are words, so PHP's numeric-string
    // juggling of == must not take part in the decision.
    if ($word === $this->word) {
        $this->step2();
    }

    $this->step3a();
    $this->step3b();
    $this->finish();

    return $this->word;
}
|
||||
|
||||
/**
 * Step 0: Attached pronoun.
 *
 * Looks for an attached pronoun at the end of the word. When the pronoun
 * follows ando/endo (checked in RV) it is deleted; when it follows ar/er/ir
 * (checked in RV) it is replaced by e.
 *
 * @return boolean true when the word ends in one of the pronouns, false otherwise
 */
private function step0()
{
    $pronouns = array(
        'gliela', 'gliele', 'glieli', 'glielo', 'gliene',
        'sene', 'mela', 'mele', 'meli', 'melo', 'mene', 'tela', 'tele', 'teli', 'telo', 'tene', 'cela',
        'cele', 'celi', 'celo', 'cene', 'vela', 'vele', 'veli', 'velo', 'vene',
        'gli', 'la', 'le', 'li', 'lo', 'mi', 'ne', 'si', 'ti', 'vi', 'ci'
    );

    $position = $this->search($pronouns);

    if ($position === false) {
        return false;
    }

    // The pronoun actually found at the end of the word.
    $pronoun = Utf8::substr($this->word, $position);

    // (a) ando/endo + pronoun in RV: the pronoun is deleted
    if ($this->searchIfInRv(array('ando' . $pronoun, 'endo' . $pronoun)) !== false) {
        $this->word = Utf8::substr($this->word, 0, $position);
    }

    // (b) ar/er/ir + pronoun in RV: the pronoun is replaced by e
    if ($this->searchIfInRv(array('ar' . $pronoun, 'er' . $pronoun, 'ir' . $pronoun)) !== false) {
        $this->word = preg_replace('#(' . $pronoun . ')$#u', 'e', $this->word);
    }

    return true;
}
|
||||
|
||||
/**
 * Step 1: Standard suffix removal.
 *
 * Tries the suffix groups below in order and stops at the first group whose
 * suffix ends the word, whether or not the region test then allowed a change.
 *
 * Positions are compared with !== false throughout, so a match at index 0 can
 * never be mistaken for "no match".
 *
 * @return boolean true when a suffix of any group was found, false otherwise
 */
private function step1()
{
    // amente
    // delete if in R1
    // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
    // if preceded by os, ic or abil, delete if in R2
    if (($position = $this->search(array('amente'))) !== false) {
        if ($this->inR1($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
        if (($position2 = $this->searchIfInR2(array('iv'))) !== false) {
            $this->word = Utf8::substr($this->word, 0, $position2);

            if (($position3 = $this->searchIfInR2(array('at'))) !== false) {
                $this->word = Utf8::substr($this->word, 0, $position3);
            }

        // if preceded by os, ic or abil, delete if in R2
        } elseif (($position4 = $this->searchIfInR2(array('os', 'ic', 'abil'))) !== false) {
            $this->word = Utf8::substr($this->word, 0, $position4);
        }

        return true;
    }

    // delete if in R2
    if (($position = $this->search(array(
        'ibili', 'atrice', 'abili', 'abile', 'ibile', 'atrici', 'mente',
        'anza', 'anze', 'iche', 'ichi', 'ismo', 'ismi', 'ista', 'iste', 'isti', 'istà', 'istè', 'istì', 'ante', 'anti',
        'ico', 'ici', 'ica', 'ice', 'oso', 'osi', 'osa', 'ose'
    ))) !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        return true;
    }

    // azione azioni atore atori
    // delete if in R2
    // if preceded by ic, delete if in R2
    if (($position = $this->search(array('azione', 'azioni', 'atore', 'atori'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);

            if (($position2 = $this->search(array('ic'))) !== false) {
                if ($this->inR2($position2)) {
                    $this->word = Utf8::substr($this->word, 0, $position2);
                }
            }
        }

        return true;
    }

    // logia logie
    // replace with log if in R2
    if (($position = $this->search(array('logia', 'logie'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace('#(logia|logie)$#u', 'log', $this->word);
        }

        return true;
    }

    // uzione uzioni usione usioni
    // replace with u if in R2
    if (($position = $this->search(array('uzione', 'uzioni', 'usione', 'usioni'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace('#(uzione|uzioni|usione|usioni)$#u', 'u', $this->word);
        }

        return true;
    }

    // enza enze
    // replace with ente if in R2
    if (($position = $this->search(array('enza', 'enze'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace('#(enza|enze)$#u', 'ente', $this->word);
        }

        return true;
    }

    // amento amenti imento imenti
    // delete if in RV
    if (($position = $this->search(array('amento', 'amenti', 'imento', 'imenti'))) !== false) {
        if ($this->inRv($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        return true;
    }

    // ità
    // delete if in R2
    // if preceded by abil, ic or iv, delete if in R2
    if (($position = $this->search(array('ità'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        if (($position2 = $this->searchIfInR2(array('abil', 'ic', 'iv'))) !== false) {
            $this->word = Utf8::substr($this->word, 0, $position2);
        }

        return true;
    }

    // ivo ivi iva ive
    // delete if in R2
    // if preceded by at, delete if in R2 (and if further preceded by ic, delete if in R2)
    if (($position = $this->search(array('ivo', 'ivi', 'iva', 'ive'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        if (($position2 = $this->searchIfInR2(array('at'))) !== false) {
            $this->word = Utf8::substr($this->word, 0, $position2);

            if (($position3 = $this->searchIfInR2(array('ic'))) !== false) {
                $this->word = Utf8::substr($this->word, 0, $position3);
            }
        }

        return true;
    }

    return false;
}
|
||||
|
||||
/**
 * Step 2: Verb suffixes.
 *
 * Searches RV for one of the verb endings below and, if one is found,
 * deletes it. The list is handed to the lookup helper in its original order.
 */
private function step2()
{
    $verbEndings = array(
        'assimo', 'assero', 'eranno', 'erebbero', 'erebbe', 'eremmo', 'ereste', 'eresti', 'essero', 'iranno', 'irebbero', 'irebbe', 'iremmo',
        'iscano', 'ireste', 'iresti', 'iscono', 'issero',
        'avamo', 'arono', 'avano', 'avate', 'eremo', 'erete', 'erono', 'evamo', 'evano', 'evate', 'ivamo', 'ivano', 'ivate', 'iremo', 'irete', 'irono',
        'ammo', 'ando', 'asse', 'assi', 'emmo', 'enda', 'ende', 'endi', 'endo', 'erai', 'erei', 'Yamo', 'iamo', 'immo', 'irà', 'irai', 'irei',
        'isca', 'isce', 'isci', 'isco',
        'ano', 'are', 'ata', 'ate', 'ati', 'ato', 'ava', 'avi', 'avo', 'erà', 'ere', 'erò', 'ete', 'eva',
        'evi', 'evo', 'ire', 'ita', 'ite', 'iti', 'ito', 'iva', 'ivi', 'ivo', 'ono', 'uta', 'ute', 'uti', 'uto', 'irò', 'ar', 'ir'
    );

    $suffixStart = $this->searchIfInRv($verbEndings);

    if ($suffixStart === false) {
        return;
    }

    $this->word = Utf8::substr($this->word, 0, $suffixStart);
}
|
||||
|
||||
/**
 * Step 3a.
 *
 * Delete a final a, e, i, o, à, è, ì or ò if it is in RV, and then a
 * preceding i if that is in RV too.
 *
 * @return boolean true when a final vowel was removed, false otherwise
 */
private function step3a()
{
    $finalVowels = array('a', 'e', 'i', 'o', 'à', 'è', 'ì', 'ò');

    if ($this->searchIfInRv($finalVowels) === false) {
        return false;
    }

    $this->word = Utf8::substr($this->word, 0, -1);

    // After the vowel is gone, a now-final i in RV is dropped as well.
    $endsWithI = $this->searchIfInRv(array('i')) !== false;

    if ($endsWithI) {
        $this->word = Utf8::substr($this->word, 0, -1);
    }

    return true;
}
|
||||
|
||||
/**
 * Step 3b.
 *
 * Replace final ch (or gh) with c (or g) if in RV (crocch -> crocc).
 */
private function step3b()
{
    if ($this->searchIfInRv(array('ch')) !== false) {
        $this->word = preg_replace('#(ch)$#u', 'c', $this->word);

        return;
    }

    // gh is only looked at when the word did not end in ch.
    if ($this->searchIfInRv(array('gh')) !== false) {
        $this->word = preg_replace('#(gh)$#u', 'g', $this->word);
    }
}
|
||||
|
||||
/**
 * Finally: turn I and U back into lower case.
 */
private function finish()
{
    $markers   = array('I', 'U');
    $lowercase = array('i', 'u');

    $this->word = Utf8::str_replace($markers, $lowercase, $this->word);
}
|
||||
}
|
127
libraries/vendor/wamania/php-stemmer/src/Norwegian.php
vendored
Normal file
127
libraries/vendor/wamania/php-stemmer/src/Norwegian.php
vendored
Normal file
|
@ -0,0 +1,127 @@
|
|||
<?php
|
||||
namespace Wamania\Snowball;
|
||||
|
||||
/**
|
||||
*
|
||||
* @link http://snowball.tartarus.org/algorithms/norwegian/stemmer.html
|
||||
* @author wamania
|
||||
*
|
||||
*/
|
||||
class Norwegian extends Stem
|
||||
{
|
||||
/**
|
||||
* All norwegian vowels
|
||||
*/
|
||||
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y', 'æ', 'å', 'ø');
|
||||
|
||||
/**
 * Stems a Norwegian word.
 *
 * {@inheritdoc}
 *
 * @throws \Exception when $word is not valid UTF-8
 */
public function stem($word)
{
    // we do ALL in UTF-8
    if (!Utf8::check($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = Utf8::strtolower($word);

    // R2 is not used: R1 is defined in the same way as in the German stemmer
    $this->r1();

    // then R1 is adjusted so that the region before it contains at least 3 letters.
    $minimumPrefix = 3;

    if ($this->r1Index < $minimumPrefix) {
        $this->r1Index = $minimumPrefix;
        $this->r1      = Utf8::substr($this->word, $minimumPrefix);
    }

    // Do each of steps 1, 2 and 3.
    $this->step1();
    $this->step2();
    $this->step3();

    return $this->word;
}
|
||||
|
||||
/**
 * Tells whether the given text ends in a valid s-ending.
 *
 * A valid s-ending is one of
 * b c d f g h j l m n o p r t v y z,
 * or k not preceded by a vowel.
 *
 * @param string $word text whose last letter (and, for k, the letter before it) is inspected
 * @return boolean
 */
private function hasValidSEnding($word)
{
    $plainEndings = array('b', 'c', 'd', 'f', 'g', 'h', 'j', 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y', 'z');
    $finalLetter  = Utf8::substr($word, -1, 1);

    if (in_array($finalLetter, $plainEndings)) {
        return true;
    }

    if ($finalLetter != 'k') {
        return false;
    }

    // k only counts when the letter before it is not a vowel.
    return !in_array(Utf8::substr($word, -2, 1), self::$vowels);
}
|
||||
|
||||
/**
 * Step 1.
 *
 * Searches R1 for a suffix from one of the groups below, tried in order, and
 * performs the action indicated for the first group that matches.
 *
 *  - erte ert : replace with er
 *  - a e ede ande ende ane ene hetene en heten ar er heter as es edes endes
 *    enes hetenes ens hetens ers ets et het ast : delete
 *  - s : delete if preceded by a valid s-ending
 *
 * @return boolean true when a suffix of any group was found in R1, false otherwise
 */
private function step1()
{
    // erte ert
    // replace with er
    if (($position = $this->searchIfInR1(array('erte', 'ert'))) !== false) {
        $this->word = preg_replace('#(erte|ert)$#u', 'er', $this->word);

        return true;
    }

    // a e ede ande ende ane ene hetene en heten ar er heter as es edes endes enes hetenes ens hetens ers ets et het ast
    // delete
    if (($position = $this->searchIfInR1(array(
        'hetenes', 'hetene', 'hetens', 'heten', 'endes', 'heter', 'ande', 'ende', 'enes', 'edes', 'ede', 'ane',
        'ene', 'het', 'ers', 'ets', 'ast', 'ens', 'en', 'ar', 'er', 'as', 'es', 'et', 'a', 'e'
    ))) !== false) {
        $this->word = Utf8::substr($this->word, 0, $position);

        return true;
    }

    // s
    // delete if preceded by a valid s-ending
    if (($position = $this->searchIfInR1(array('s'))) !== false) {
        $word = Utf8::substr($this->word, 0, $position);

        if ($this->hasValidSEnding($word)) {
            $this->word = $word;
        }

        return true;
    }

    // Nothing matched: return an explicit boolean instead of falling off the
    // end of the method (which yielded null).
    return false;
}
|
||||
|
||||
/**
 * Step 2.
 *
 * If the word ends dt or vt in R1, delete the t.
 */
private function step2()
{
    $endings = array('dt', 'vt');

    if ($this->searchIfInR1($endings) === false) {
        return;
    }

    // Only the final letter (the t) is removed.
    $this->word = Utf8::substr($this->word, 0, -1);
}
|
||||
|
||||
/**
 * Step 3.
 *
 * Searches R1 for one of the suffixes
 * leg eleg ig eig lig elig els lov elov slov hetslov
 * and, if one is found, deletes it.
 */
private function step3()
{
    $endings = array(
        'hetslov', 'eleg', 'elov', 'slov', 'elig', 'eig', 'lig', 'els', 'lov', 'leg', 'ig'
    );

    $suffixStart = $this->searchIfInR1($endings);

    if ($suffixStart === false) {
        return;
    }

    $this->word = Utf8::substr($this->word, 0, $suffixStart);
}
|
||||
}
|
280
libraries/vendor/wamania/php-stemmer/src/Portuguese.php
vendored
Normal file
280
libraries/vendor/wamania/php-stemmer/src/Portuguese.php
vendored
Normal file
|
@ -0,0 +1,280 @@
|
|||
<?php
|
||||
namespace Wamania\Snowball;
|
||||
|
||||
/**
|
||||
*
|
||||
* @link http://snowball.tartarus.org/algorithms/portuguese/stemmer.html
|
||||
* @author wamania
|
||||
*
|
||||
*/
|
||||
class Portuguese extends Stem
|
||||
{
|
||||
/**
|
||||
* All Portuguese vowels
|
||||
*/
|
||||
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'á', 'é', 'í', 'ó', 'ú', 'â', 'ê', 'ô');
|
||||
|
||||
/**
 * Stems a Portuguese word.
 *
 * {@inheritdoc}
 *
 * @throws \Exception when $word is not valid UTF-8
 */
public function stem($word)
{
    // we do ALL in UTF-8
    if (! Utf8::check($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = Utf8::strtolower($word);

    // Rewrite the nasalised vowels ã and õ as a~ and o~ before the steps run.
    $this->word = Utf8::str_replace(array('ã', 'õ'), array('a~', 'o~'), $this->word);

    $this->rv();
    $this->r1();
    $this->r2();

    // Snapshot used to detect whether step 1 / step 2 changed the word.
    $word = $this->word;
    $this->step1();

    // Do step 2 only if step 1 left the word untouched.
    // Strict comparisons: both sides are words, so PHP's numeric-string
    // juggling of == / != must not take part in the decision.
    if ($word === $this->word) {
        $this->step2();
    }

    // Step 3 when step 1 or step 2 altered the word, step 4 otherwise.
    if ($word !== $this->word) {
        $this->step3();
    } else {
        $this->step4();
    }

    $this->step5();
    $this->finish();

    return $this->word;
}
|
||||
|
||||
/**
 * Step 1: Standard suffix removal.
 *
 * The suffix groups are tried in order. As soon as a suffix of a group ends
 * the word, that group's action is applied and the method returns true, even
 * when the region condition fails and the word is left untouched.
 *
 * The word is expected in the form prepared by stem(): lower case, with the
 * nasal vowels ã/õ written as a~/o~. The suffixes "ução/uções" are therefore
 * matched as "uça~o/uço~es".
 *
 * Inside each list, a suffix that ends with another suffix of the same list
 * must come first (e.g. 'logias' before 'logia'); this relies on search()
 * returning the first entry that matches — presumably in list order, confirm
 * against Stem::search().
 *
 * @return boolean true when one of the step 1 suffixes ends the word
 */
public function step1()
{
    // delete if in R2
    if (($position = $this->search(array(
        'amentos', 'imentos', 'adoras', 'adores', 'amento', 'imento', 'adora', 'istas', 'ismos', 'antes', 'ância',
        'ezas', 'eza', 'icos', 'icas', 'ismo', 'ável', 'ível', 'ista', 'oso',
        'osos', 'osas', 'osa', 'ico', 'ica', 'ador', 'aça~o', 'aço~es' , 'ante'))) !== false) {

        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }
        return true;
    }

    // logia logias
    // replace with log if in R2
    if (($position = $this->search(array('logias', 'logia'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position) . 'log';
        }
        return true;
    }

    // ução uções (seen here as uça~o uço~es)
    // replace with u if in R2
    if (($position = $this->search(array('uço~es', 'uça~o'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position) . 'u';
        }
        return true;
    }

    // ência ências
    // replace with ente if in R2
    if (($position = $this->search(array('ências', 'ência'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position) . 'ente';
        }
        return true;
    }

    // amente
    // delete if in R1
    // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
    // if preceded by os, ic or ad, delete if in R2
    if (($position = $this->search(array('amente'))) !== false) {

        // delete if in R1
        if ($this->inR1($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        // The follow-up searches only match when "amente" was actually removed,
        // because they look at the current end of the word.
        if (($position2 = $this->searchIfInR2(array('iv'))) !== false) {
            $this->word = Utf8::substr($this->word, 0, $position2);

            if (($position3 = $this->searchIfInR2(array('at'))) !== false) {
                $this->word = Utf8::substr($this->word, 0, $position3);
            }
        } elseif (($position4 = $this->searchIfInR2(array('os', 'ic', 'ad'))) !== false) {
            $this->word = Utf8::substr($this->word, 0, $position4);
        }

        return true;
    }

    // mente
    // delete if in R2
    // if preceded by ante, avel or ível, delete if in R2
    if (($position = $this->search(array('mente'))) !== false) {

        // delete if in R2
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        // strict comparison, like every other check in this method
        if (($position2 = $this->searchIfInR2(array('ante', 'avel', 'ível'))) !== false) {
            $this->word = Utf8::substr($this->word, 0, $position2);
        }

        return true;
    }

    // idade idades
    // delete if in R2
    // if preceded by abil, ic or iv, delete if in R2
    if (($position = $this->search(array('idades', 'idade'))) !== false) {

        // delete if in R2
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        if (($position2 = $this->searchIfInR2(array('abil', 'ic', 'iv'))) !== false) {
            $this->word = Utf8::substr($this->word, 0, $position2);
        }

        return true;
    }

    // iva ivo ivas ivos
    // delete if in R2
    // if preceded by at, delete if in R2
    if (($position = $this->search(array('ivas', 'ivos', 'iva', 'ivo'))) !== false) {

        // delete if in R2
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        if (($position2 = $this->searchIfInR2(array('at'))) !== false) {
            $this->word = Utf8::substr($this->word, 0, $position2);
        }

        return true;
    }

    // ira iras
    // replace with ir if in RV and preceded by e
    if (($position = $this->search(array('iras', 'ira'))) !== false) {

        if ($this->inRv($position)) {
            $letter = Utf8::substr($this->word, $position - 1, 1);

            if ($letter == 'e') {
                $this->word = Utf8::substr($this->word, 0, $position) . 'ir';
            }
        }

        return true;
    }

    return false;
}
|
||||
|
||||
/**
 * Step 2: Verb suffixes.
 *
 * Looks for one of the verb endings below inside RV and cuts the word at the
 * offset reported by searchIfInRv(). Endings that finish with another ending
 * of the list are placed before it.
 *
 * @return boolean true when a verb suffix was removed, false otherwise
 */
public function step2()
{
    $verbEndings = array(
        'aríamos', 'eríamos', 'iríamos', 'ássemos', 'êssemos', 'íssemos',
        'aríeis', 'eríeis', 'iríeis', 'ásseis', 'ésseis', 'ísseis', 'áramos', 'éramos', 'íramos', 'ávamos',
        'aremos', 'eremos', 'iremos',
        'ariam', 'eriam', 'iriam', 'assem', 'essem', 'issem', 'arias', 'erias', 'irias', 'ardes', 'erdes', 'irdes',
        'asses', 'esses', 'isses', 'astes', 'estes', 'istes', 'áreis', 'areis', 'éreis', 'ereis', 'íreis', 'ireis',
        'áveis', 'íamos', 'armos', 'ermos', 'irmos',
        'aria', 'eria', 'iria', 'asse', 'esse', 'isse', 'aste', 'este', 'iste', 'arei', 'erei', 'irei', 'adas', 'idas',
        'aram', 'eram', 'iram', 'avam', 'arem', 'erem', 'irem', 'ando', 'endo', 'indo', 'ara~o', 'era~o', 'ira~o',
        'arás', 'aras', 'erás', 'eras', 'irás', 'avas', 'ares', 'eres', 'ires', 'íeis', 'ados', 'idos', 'ámos', 'amos',
        'emos', 'imos', 'iras',
        'ada', 'ida', 'ará', 'ara', 'erá', 'era', 'irá', 'ava', 'iam', 'ado', 'ido', 'ias', 'ais', 'eis', 'ira',
        'ia', 'ei', 'am', 'em', 'ar', 'er', 'ir', 'as', 'es', 'is', 'eu', 'iu', 'ou',
    );

    $cutAt = $this->searchIfInRv($verbEndings);

    // Nothing to strip
    if ($cutAt === false) {
        return false;
    }

    $this->word = Utf8::substr($this->word, 0, $cutAt);

    return true;
}
|
||||
|
||||
/**
 * Step 3: Delete a final "i" that lies in RV when it is preceded by "c".
 *
 * @return boolean true when the word ends with "i" in RV (whether or not it
 *                 was deleted), false otherwise
 */
public function step3()
{
    $endsWithI = $this->searchIfInRv(array('i'));

    if ($endsWithI === false) {
        return false;
    }

    // The letter just before the final "i"
    $previous = Utf8::substr($this->word, -2, 1);

    if ($previous == 'c') {
        // Drop the trailing "i"
        $this->word = Utf8::substr($this->word, 0, -1);
    }

    return true;
}
|
||||
|
||||
/**
 * Step 4: Residual suffix.
 *
 * If the word ends with one of "os a i o á í ó" in RV, the ending is removed.
 *
 * @return boolean true when an ending was removed, false otherwise
 */
public function step4()
{
    $residual = array('os', 'a', 'i', 'o','á', 'í', 'ó');
    $cutAt    = $this->searchIfInRv($residual);

    if ($cutAt === false) {
        return false;
    }

    $this->word = Utf8::substr($this->word, 0, $cutAt);

    return true;
}
|
||||
|
||||
/**
 * Step 5: Final "e é ê" and cedilla.
 *
 * If the word ends with one of "e é ê" in RV, that letter is deleted; when
 * the remainder then ends with "gu" (or "ci") and the "u" (or "i") lies in
 * RV, that letter is deleted as well. Otherwise a final "ç" becomes "c".
 *
 * @return boolean true when one of the two cases applied, false otherwise
 */
public function step5()
{
    $finalE = $this->searchIfInRv(array('e', 'é', 'ê'));

    if ($finalE !== false) {
        // Remove the final vowel
        $this->word = Utf8::substr($this->word, 0, -1);

        // Look at the new ending: "gu" / "ci" loses its second letter when that letter is in RV
        $digraphAt = $this->search(array('gu', 'ci'));
        if ($digraphAt !== false && $this->inRv(($digraphAt+1))) {
            $this->word = Utf8::substr($this->word, 0, -1);
        }

        return true;
    }

    $cedilla = $this->search(array('ç'));
    if ($cedilla !== false) {
        // Strip the cedilla from the last letter
        $this->word = preg_replace('#(ç)$#u', 'c', $this->word);

        return true;
    }

    return false;
}
|
||||
|
||||
/**
 * Finally: turn the "a~" and "o~" markers back into ã and õ.
 *
 * This undoes the replacement performed at the start of stem().
 */
public function finish()
{
    $markers = array('a~', 'o~');
    $nasals  = array('ã', 'õ');

    $this->word = Utf8::str_replace($markers, $nasals, $this->word);
}
|
||||
}
|
331
libraries/vendor/wamania/php-stemmer/src/Romanian.php
vendored
Normal file
331
libraries/vendor/wamania/php-stemmer/src/Romanian.php
vendored
Normal file
|
@ -0,0 +1,331 @@
|
|||
<?php
|
||||
namespace Wamania\Snowball;
|
||||
|
||||
/**
|
||||
*
|
||||
* @link http://snowball.tartarus.org/algorithms/romanian/stemmer.html
|
||||
* @author wamania
|
||||
*
|
||||
*/
|
||||
class Romanian extends Stem
|
||||
{
|
||||
/**
|
||||
* All Romanian vowels
|
||||
*/
|
||||
protected static $vowels = array('a', 'ă', 'â', 'e', 'i', 'î', 'o', 'u');
|
||||
|
||||
/**
 * {@inheritdoc}
 *
 * Lower-cases the word, marks "i" and "u" standing between two vowels as
 * upper case, computes the RV, R1 and R2 regions and runs steps 0 to 4.
 *
 * @param string $word word to stem, must be valid UTF-8
 * @return string the stemmed word
 * @throws \Exception when $word is not valid UTF-8
 */
public function stem($word)
{
    // Everything below works on UTF-8 strings only
    $isUtf8 = Utf8::check($word);
    if (!$isUtf8) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = Utf8::strtolower($word);

    $this->plainVowels = implode('', self::$vowels);

    // u, then i, between vowels are upper-cased so that they are treated as consonants
    $vowelClass = '[' . $this->plainVowels . ']';
    foreach (array('u' => 'U', 'i' => 'I') as $lower => $upper) {
        $this->word = preg_replace(
            '#(' . $vowelClass . ')' . $lower . '(' . $vowelClass . ')#u',
            '$1' . $upper . '$2',
            $this->word
        );
    }

    $this->rv();
    $this->r1();
    $this->r2();

    $this->step0();

    // Reference point for "did step 1 or step 2 change anything?"
    $afterStep0 = $this->word;

    // Step 1 is repeated until it stops changing the word
    do {
        $previous = $this->word;
        $this->step1();
        $changed = ($this->word != $previous);
    } while ($changed);

    $this->step2();

    // Step 3 only when neither step 1 nor step 2 altered the word
    if ($this->word == $afterStep0) {
        $this->step3();
    }

    $this->step4();
    $this->finish();

    return $this->word;
}
|
||||
|
||||
/**
 * Step 0: Removal of plurals (and other simplifications).
 *
 * The suffix groups are tried in order. As soon as a suffix of a group ends
 * the word, the group's action is applied if the suffix is in R1, and the
 * method returns true (also when the R1 condition fails and nothing changes).
 *
 * @return boolean true when one of the step 0 suffixes ends the word
 */
public function step0()
{
    // ul ului
    // delete
    if (($position = $this->search(array('ul', 'ului'))) !== false) {
        if ($this->inR1($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }
        return true;
    }

    // aua
    // replace with a
    if (($position = $this->search(array('aua'))) !== false) {
        if ($this->inR1($position)) {
            $this->word = preg_replace('#(aua)$#u', 'a', $this->word);
        }
        return true;
    }

    // ea ele elor
    // replace with e
    if (($position = $this->search(array('ea', 'ele', 'elor'))) !== false) {
        if ($this->inR1($position)) {
            $this->word = preg_replace('#(ea|ele|elor)$#u', 'e', $this->word);
        }
        return true;
    }

    // ii iua iei iile iilor ilor
    // replace with i
    if (($position = $this->search(array('ii', 'iua', 'iei', 'iile', 'iilor', 'ilor'))) !== false) {
        if ($this->inR1($position)) {
            $this->word = preg_replace('#(ii|iua|iei|iile|iilor|ilor)$#u', 'i', $this->word);
        }
        return true;
    }

    // ile
    // replace with i if not preceded by ab
    if (($position = $this->search(array('ile'))) !== false) {
        if ($this->inR1($position)) {
            // the two letters in front of "ile"
            $before = Utf8::substr($this->word, ($position-2), 2);

            if ($before != 'ab') {
                $this->word = preg_replace('#(ile)$#u', 'i', $this->word);
            }
        }
        return true;
    }

    // atei
    // replace with at
    // (strict comparison, like every other group: offset 0 is a valid match)
    if (($position = $this->search(array('atei'))) !== false) {
        if ($this->inR1($position)) {
            $this->word = preg_replace('#(atei)$#u', 'at', $this->word);
        }
        return true;
    }

    // aţie aţia
    // replace with aţi
    if (($position = $this->search(array('aţie', 'aţia'))) !== false) {
        if ($this->inR1($position)) {
            $this->word = preg_replace('#(aţie|aţia)$#u', 'aţi', $this->word);
        }
        return true;
    }

    return false;
}
|
||||
|
||||
/**
 * Step 1: Reduction of combining suffixes.
 *
 * The suffix groups are tried in order. As soon as a suffix of a group ends
 * the word, it is replaced by the group's replacement if it is in R1, and
 * the method returns true (also when the R1 condition fails). stem() repeats
 * this step until the word no longer changes.
 *
 * The replacement is built from the offset returned by search(), so each
 * suffix is written only once. Inside a group, a suffix that ends with
 * another suffix of the same group must come first; this assumes search()
 * returns the first matching entry — confirm against Stem::search().
 *
 * @return boolean true when one of the step 1 suffixes ends the word
 */
public function step1()
{
    // Each rule: array(list of suffixes, replacement)
    $rules = array(
        // abilitate abilitati abilităi abilităţi => abil
        array(
            array('abilitate', 'abilitati', 'abilităi', 'abilităţi'),
            'abil',
        ),
        // ibilitate => ibil
        array(
            array('ibilitate'),
            'ibil',
        ),
        // ivitate ivitati ivităi ivităţi => iv
        array(
            array('ivitate', 'ivitati', 'ivităi', 'ivităţi'),
            'iv',
        ),
        // icitate icitati icităi icităţi icator icatori iciv iciva icive icivi icivă ical icala icale icali icală => ic
        array(
            array(
                'icitate', 'icitati', 'icităi', 'icităţi', 'icatori', 'icator', 'iciva',
                'icive', 'icivi', 'icivă', 'icala', 'icale', 'icali', 'icală', 'iciv', 'ical',
            ),
            'ic',
        ),
        // ativ ativa ative ativi ativă aţiune atoare ator atori ătoare ător ători => at
        array(
            array(
                'ativa', 'ative', 'ativi', 'ativă', 'ativ', 'aţiune',
                'atoare', 'atori', 'ătoare', 'ători', 'ător', 'ator',
            ),
            'at',
        ),
        // itiv itiva itive itivi itivă iţiune itoare itor itori => it
        array(
            array('itiva', 'itive', 'itivi', 'itivă', 'itiv', 'iţiune', 'itoare', 'itori', 'itor'),
            'it',
        ),
    );

    foreach ($rules as $rule) {
        list($suffixes, $replacement) = $rule;

        $position = $this->search($suffixes);
        if ($position === false) {
            continue;
        }

        if ($this->inR1($position)) {
            // Keep everything before the matched suffix and append the replacement.
            // The former regex listed "cator" instead of "icator" and turned
            // "...icator" into "...iic".
            $this->word = Utf8::substr($this->word, 0, $position) . $replacement;
        }

        return true;
    }

    return false;
}
|
||||
|
||||
/**
 * Step 2: Removal of 'standard' suffixes.
 *
 * Three suffix groups are tried in order; the first group with a suffix
 * ending the word decides the outcome, and its action is applied only when
 * the suffix is in R2.
 *
 * @return boolean true when one of the step 2 suffixes ends the word
 */
public function step2()
{
    // Group 1: plain deletion
    $deletable = array(
        'atori', 'itate', 'itati', 'ităţi', 'abila', 'abile', 'abili', 'abilă', 'ibila', 'ibile', 'ibili', 'ibilă',
        'anta', 'ante', 'anti', 'antă', 'ator', 'ibil', 'oasa', 'oasă', 'oase', 'ităi', 'abil',
        'osi', 'oşi', 'ant', 'ici', 'ică', 'iva', 'ive', 'ivi', 'ivă', 'ata', 'ată', 'ati', 'ate', 'ata', 'ată',
        'ati', 'ate', 'uta', 'ută', 'uti', 'ute', 'ita', 'ită', 'iti', 'ite', 'ica', 'ice',
        'at', 'os', 'iv', 'ut', 'it', 'ic'
    );

    $cutAt = $this->search($deletable);
    if ($cutAt !== false) {
        if ($this->inR2($cutAt)) {
            $this->word = Utf8::substr($this->word, 0, $cutAt);
        }

        return true;
    }

    // Group 2: iune iuni, deleted when preceded by ţ, which then becomes t
    $cutAt = $this->search(array('iune', 'iuni'));
    if ($cutAt !== false) {
        if ($this->inR2($cutAt) && Utf8::substr($this->word, $cutAt - 1, 1) == 'ţ') {
            $this->word = Utf8::substr($this->word, 0, $cutAt);
            $this->word = preg_replace('#(ţ)$#u', 't', $this->word);
        }

        return true;
    }

    // Group 3: ism isme ist ista iste isti istă işti, all reduced to ist
    $cutAt = $this->search(array('isme', 'ism', 'ista', 'iste', 'isti', 'istă', 'işti', 'ist'));
    if ($cutAt !== false) {
        if ($this->inR2($cutAt)) {
            $this->word = preg_replace('#(isme|ism|ista|iste|isti|istă|işti|ist)$#u', 'ist', $this->word);
        }

        return true;
    }

    return false;
}
|
||||
|
||||
/**
 * Step 3: Removal of verb suffixes.
 *
 * stem() calls this step only if no suffix was removed by step 1 or step 2.
 * Two groups are tried in order:
 *  - group 1 endings are deleted when the preceding letter lies in RV and is
 *    a consonant or "u";
 *  - group 2 endings are deleted unconditionally.
 *
 * @return boolean true when a suffix of either group ends the word in RV,
 *                 false otherwise
 */
public function step3()
{
    // are ere ire âre ind ând indu ându eze ească ez ezi ează esc eşti
    // eşte ăsc ăşti ăşte am ai au eam eai ea eaţi eau iam iai ia iaţi
    // iau ui aşi arăm arăţi ară uşi urăm urăţi ură işi irăm irăţi iră âi
    // âşi ârăm ârăţi âră asem aseşi ase aserăm aserăţi aseră isem iseşi ise
    // iserăm iserăţi iseră âsem âseşi âse âserăm âserăţi âseră usem useşi use userăm userăţi useră
    // delete if preceded in RV by a consonant or u
    if (($position = $this->searchIfInRv(array(
        'userăţi', 'iserăţi', 'âserăţi', 'aserăţi',
        'userăm', 'iserăm', 'âserăm', 'aserăm',
        'iseră', 'âseşi', 'useră', 'âseră', 'useşi', 'iseşi', 'aseră', 'aseşi', 'ârăţi', 'irăţi', 'urăţi', 'arăţi', 'ească',
        'usem', 'âsem', 'isem', 'asem', 'ârăm', 'urăm', 'irăm', 'arăm', 'iaţi', 'eaţi', 'ăşte', 'ăşti', 'eşte', 'eşti', 'ează', 'ându', 'indu',
        'âse', 'use', 'ise', 'ase', 'âră', 'iră', 'işi', 'ură', 'uşi', 'ară', 'aşi', 'âşi', 'iau', 'iai', 'iam', 'eau', 'eai', 'eam', 'ăsc',
        'are', 'ere', 'ire', 'âre', 'ind', 'ând', 'eze', 'ezi', 'esc',
        'âi', 'ui', 'ia', 'ea', 'au', 'ai', 'am', 'ez'
    ))) !== false) {
        if ($this->inRv($position)) {
            $before = $position - 1;

            // the preceding letter must itself be inside RV
            if ($this->inRv($before)) {
                $letter = Utf8::substr($this->word, $before, 1);

                if ((!in_array($letter, self::$vowels)) || ($letter == 'u')) {
                    $this->word = Utf8::substr($this->word, 0, $position);
                }
            }
        }

        return true;
    }

    // ăm aţi em eţi im iţi âm âţi seşi serăm serăţi seră sei se sesem seseşi sese seserăm seserăţi seseră
    // delete
    if (($position = $this->searchIfInRv(array(
        'seserăm', 'seserăţi', 'seseră', 'seseşi', 'sesem', 'serăţi', 'serăm', 'seşi', 'sese', 'seră',
        'aţi', 'eţi', 'iţi', 'âţi', 'sei', 'se', 'ăm', 'âm', 'em', 'im'
    ))) !== false) {
        if ($this->inRv($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        return true;
    }

    // No verb suffix found: return an explicit boolean instead of falling off the end
    return false;
}
|
||||
|
||||
/**
 * Step 4: Removal of final vowel.
 *
 * If the word ends with one of "a e i ie ă" and that ending is in RV, it is
 * deleted ("ie" is listed before "e").
 *
 * @return boolean always true
 */
public function step4()
{
    $finalVowels = array('a', 'ie', 'e', 'i', 'ă');
    $cutAt       = $this->search($finalVowels);

    if ($cutAt !== false && $this->inRv($cutAt)) {
        $this->word = Utf8::substr($this->word, 0, $cutAt);
    }

    return true;
}
|
||||
|
||||
/**
 * Finally: restore the upper-cased markers I and U (set in stem()) to i and u.
 */
public function finish()
{
    $markers = array('I', 'U');
    $letters = array('i', 'u');

    $this->word = Utf8::str_replace($markers, $letters, $this->word);
}
|
||||
}
|
249
libraries/vendor/wamania/php-stemmer/src/Russian.php
vendored
Normal file
249
libraries/vendor/wamania/php-stemmer/src/Russian.php
vendored
Normal file
|
@ -0,0 +1,249 @@
|
|||
<?php
|
||||
namespace Wamania\Snowball;
|
||||
|
||||
/**
|
||||
*
|
||||
* @link http://snowball.tartarus.org/algorithms/russian/stemmer.html
|
||||
* @author wamania
|
||||
*
|
||||
*/
|
||||
class Russian extends Stem
|
||||
{
|
||||
/**
|
||||
* All russian vowels
|
||||
*/
|
||||
protected static $vowels = array('а', 'е', 'и', 'о', 'у', 'ы', 'э', 'ю', 'я');
|
||||
|
||||
protected static $perfectiveGerund = array(
|
||||
array('вшись', 'вши', 'в'),
|
||||
array('ывшись', 'ившись', 'ывши', 'ивши', 'ив', 'ыв')
|
||||
);
|
||||
|
||||
protected static $adjective = array(
|
||||
'ыми', 'ими', 'ему', 'ому', 'его', 'ого', 'ее', 'ие', 'ые', 'ое', 'ей', 'ий',
|
||||
'ый', 'ой', 'ем', 'им', 'ым','ом','их', 'ых', 'ую', 'юю', 'ая', 'яя', 'ою', 'ею'
|
||||
);
|
||||
|
||||
protected static $participle = array(
|
||||
array('ем', 'нн', 'вш', 'ющ', 'щ'),
|
||||
array('ивш', 'ывш', 'ующ')
|
||||
);
|
||||
|
||||
protected static $reflexive = array('ся', 'сь');
|
||||
|
||||
protected static $verb = array(
|
||||
array('ешь', 'нно', 'ете', 'йте', 'ла', 'на', 'ли', 'й', 'л', 'ем', 'н', 'ло', 'но', 'ет', 'ют', 'ны', 'ть'),
|
||||
array(
|
||||
'уйте', 'ило', 'ыло', 'ено','ила', 'ыла', 'ена', 'ейте', 'ены', 'ить', 'ыть', 'ишь', 'ите', 'или', 'ыли',
|
||||
'ует', 'уют', 'ей', 'уй', 'ил', 'ыл', 'им', 'ым', 'ен', 'ят', 'ит', 'ыт', 'ую', 'ю'
|
||||
)
|
||||
);
|
||||
|
||||
protected static $noun = array(
|
||||
'иями', 'ями', 'ами', 'ией', 'иям', 'ием', 'иях', 'ев', 'ов', 'ие', 'ье', 'еи', 'ии', 'ей', 'ой', 'ий', 'ям',
|
||||
'ем', 'ам', 'ом', 'ах', 'ях', 'ию', 'ью', 'ия', 'ья', 'я', 'а', 'е', 'ы', 'ь', 'и', 'о', 'у', 'й', 'ю'
|
||||
);
|
||||
|
||||
protected static $superlative = array('ейше', 'ейш');
|
||||
|
||||
protected static $derivational = array('ость', 'ост');
|
||||
|
||||
/**
 * {@inheritdoc}
 *
 * Lower-cases the word, computes the R1, R2 and RV regions and runs steps
 * 1 to 4 in order.
 *
 * @param string $word word to stem, must be valid UTF-8
 * @return string the stemmed word
 * @throws \Exception when $word is not valid UTF-8
 */
public function stem($word)
{
    // Everything below works on UTF-8 strings only
    $isUtf8 = Utf8::check($word);
    if (!$isUtf8) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = Utf8::strtolower($word);

    // Regions: rv() is the override defined in this class
    $this->r1();
    $this->r2();
    $this->rv();

    // Every step is always executed
    $this->step1();
    $this->step2();
    $this->step3();
    $this->step4();

    return $this->word;
}
|
||||
|
||||
/**
 * Step 1: Search for a PERFECTIVE GERUND ending. If one is found it is
 * removed and the step ends.
 *
 * Otherwise a REFLEXIVE ending is removed if present, and then (1) an
 * ADJECTIVAL, (2) a VERB or (3) a NOUN ending is searched in turn; the first
 * one found is removed and the step ends. Endings of a "group 1" list are
 * only accepted when checkGroup1() confirms the preceding letter.
 *
 * @return boolean true when an ending (other than a lone reflexive one) was
 *                 removed, false otherwise
 */
public function step1()
{
    // PERFECTIVE GERUND, group 1
    $at = $this->searchIfInRv(self::$perfectiveGerund[0]);
    if ($at !== false && $this->inRv($at) && $this->checkGroup1($at)) {
        $this->word = Utf8::substr($this->word, 0, $at);

        return true;
    }

    // PERFECTIVE GERUND, group 2
    $at = $this->searchIfInRv(self::$perfectiveGerund[1]);
    if ($at !== false && $this->inRv($at)) {
        $this->word = Utf8::substr($this->word, 0, $at);

        return true;
    }

    // REFLEXIVE ending: removed, but the step goes on
    $at = $this->searchIfInRv(self::$reflexive);
    if ($at !== false && $this->inRv($at)) {
        $this->word = Utf8::substr($this->word, 0, $at);
    }

    // (1) ADJECTIVAL: an adjective ending, optionally preceded by a participle ending
    $at = $this->searchIfInRv(self::$adjective);
    if ($at !== false && $this->inRv($at)) {
        $this->word = Utf8::substr($this->word, 0, $at);

        // participle, group 1
        $participleAt = $this->search(self::$participle[0]);
        if ($participleAt !== false && $this->inRv($participleAt) && $this->checkGroup1($participleAt)) {
            $this->word = Utf8::substr($this->word, 0, $participleAt);

            return true;
        }

        // participle, group 2
        $participleAt = $this->search(self::$participle[1]);
        if ($participleAt !== false && $this->inRv($participleAt)) {
            $this->word = Utf8::substr($this->word, 0, $participleAt);

            return true;
        }

        return true;
    }

    // (2) VERB, group 1
    $at = $this->searchIfInRv(self::$verb[0]);
    if ($at !== false && $this->inRv($at) && $this->checkGroup1($at)) {
        $this->word = Utf8::substr($this->word, 0, $at);

        return true;
    }

    // (2) VERB, group 2
    $at = $this->searchIfInRv(self::$verb[1]);
    if ($at !== false && $this->inRv($at)) {
        $this->word = Utf8::substr($this->word, 0, $at);

        return true;
    }

    // (3) NOUN
    $at = $this->searchIfInRv(self::$noun);
    if ($at !== false && $this->inRv($at)) {
        $this->word = Utf8::substr($this->word, 0, $at);

        return true;
    }

    return false;
}
|
||||
|
||||
/**
 * Step 2: If the word ends with и (i) in RV, remove it.
 *
 * @return boolean true when the final и was removed, false otherwise
 */
public function step2()
{
    $cutAt = $this->searchIfInRv(array('и'));

    if ($cutAt === false || !$this->inRv($cutAt)) {
        return false;
    }

    $this->word = Utf8::substr($this->word, 0, $cutAt);

    return true;
}
|
||||
|
||||
/**
 * Step 3: Search for a DERIVATIONAL ending in R2 (i.e. the entire ending
 * must lie in R2), and if one is found, remove it.
 *
 * @return boolean true when a derivational ending was removed, false otherwise
 */
public function step3()
{
    if (($position = $this->searchIfInRv(self::$derivational)) !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);

            return true;
        }
    }

    // Explicit boolean, like step1() and step2(), instead of an implicit null
    return false;
}
|
||||
|
||||
/**
 * Step 4: (1) Undouble н (n), or, (2) if the word ends with a SUPERLATIVE
 * ending, remove it and undouble н (n), or (3) if the word ends ь (') (soft
 * sign) remove it.
 *
 * All searches are limited to RV. The superlative ending is handled first so
 * that a double н uncovered by its removal is undoubled too.
 *
 * @return boolean true when the word was modified, false otherwise
 */
public function step4()
{
    $changed = false;

    // (2) if the word ends with a SUPERLATIVE ending, remove it
    if (($position = $this->searchIfInRv(self::$superlative)) !== false) {
        $this->word = Utf8::substr($this->word, 0, $position);
        $changed = true;
    }

    // (1) Undouble н (n): keep the first н, drop the second
    if (($position = $this->searchIfInRv(array('нн'))) !== false) {
        $this->word = Utf8::substr($this->word, 0, ($position+1));

        return true;
    }

    // (3) if the word ends ь (') (soft sign) remove it
    if (($position = $this->searchIfInRv(array('ь'))) !== false) {
        $this->word = Utf8::substr($this->word, 0, $position);

        return true;
    }

    // Always return a boolean; true when only the superlative ending was removed
    return $changed;
}
|
||||
|
||||
/**
 * In any word, RV is the region after the first vowel, or the end of the
 * word if it contains no vowel.
 *
 * Sets $this->rv (the region text) and $this->rvIndex (its start offset).
 *
 * @return boolean true when a vowel was found, false otherwise
 */
protected function rv()
{
    $total = Utf8::strlen($this->word);

    // Defaults for a word without any vowel: empty region at the end of the word
    $this->rv      = '';
    $this->rvIndex = $total;

    $index = 0;
    while ($index < $total) {
        $current = Utf8::substr($this->word, $index, 1);
        $next    = $index + 1;

        if (in_array($current, self::$vowels)) {
            // RV starts right after this first vowel
            $this->rv      = Utf8::substr($this->word, $next);
            $this->rvIndex = $next;

            return true;
        }

        $index = $next;
    }

    return false;
}
|
||||
|
||||
/**
 * Group 1 endings must follow а (a) or я (ia), and that letter must be in RV.
 *
 * @param integer $position offset at which the ending starts
 * @return boolean true when the letter before $position is in RV and is а or я
 */
private function checkGroup1($position)
{
    $previous = $position - 1;

    // The preceding letter has to lie inside RV as well
    if (!$this->inRv($previous)) {
        return false;
    }

    $letter = Utf8::substr($this->word, $previous, 1);

    return ($letter == 'а' || $letter == 'я');
}
|
||||
}
|
345
libraries/vendor/wamania/php-stemmer/src/Spanish.php
vendored
Normal file
345
libraries/vendor/wamania/php-stemmer/src/Spanish.php
vendored
Normal file
|
@ -0,0 +1,345 @@
|
|||
<?php
|
||||
namespace Wamania\Snowball;
|
||||
|
||||
/**
|
||||
*
|
||||
* @link http://snowball.tartarus.org/algorithms/spanish/stemmer.html
|
||||
* @author wamania
|
||||
*
|
||||
*/
|
||||
class Spanish extends Stem
|
||||
{
|
||||
/**
|
||||
* All spanish vowels
|
||||
*/
|
||||
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'á', 'é', 'í', 'ó', 'ú', 'ü');
|
||||
|
||||
/**
 * {@inheritdoc}
 *
 * Lower-cases the word, computes the RV, R1 and R2 regions and runs the
 * steps 0, 1, 2a, 2b and 3 followed by finish().
 *
 * @param string $word word to stem, must be valid UTF-8
 * @return string the stemmed word
 * @throws \Exception when $word is not valid UTF-8
 */
public function stem($word)
{
    // Everything below works on UTF-8 strings only
    $isUtf8 = Utf8::check($word);
    if (!$isUtf8) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = Utf8::strtolower($word);

    $this->rv();
    $this->r1();
    $this->r2();

    $this->step0();

    // Snapshot taken after step 0, used to detect removals by step 1 / 2a
    $snapshot = $this->word;

    $this->step1();

    // Step 2a only if step 1 removed nothing
    $step1Idle = ($this->word == $snapshot);
    if ($step1Idle) {
        $this->step2a();

        // Step 2b only if step 2a was done and removed nothing either
        $step2aIdle = ($this->word == $snapshot);
        if ($step2aIdle) {
            $this->step2b();
        }
    }

    $this->step3();
    $this->finish();

    return $this->word;
}
|
||||
|
||||
/**
 * Step 0: Attached pronoun
 *
 * Looks in RV for one of the pronoun suffixes
 *  me se sela selo selas selos la le lo las les los nos
 *
 * and deletes it if it comes after one of
 *  (a) iéndo ándo ár ér ír
 *  (b) ando iendo ar er ir
 *  (c) yendo following u
 *
 * in RV. In case (c) the u before yendo is read without a region check.
 * In case (a) the remaining ending is additionally passed through
 * Utf8::deaccent() (for example, haciéndola -> haciendo).
 *
 * @return boolean true when a pronoun was removed, false otherwise
 */
private function step0()
{
    $pronounAt = $this->searchIfInRv(array('selas', 'selos', 'las', 'los', 'les', 'nos', 'selo', 'sela', 'me', 'se', 'la', 'le', 'lo' ));

    // Loose comparison kept on purpose: both "not found" and offset 0 stop here
    if ($pronounAt == false) {
        return false;
    }

    $pronoun = Utf8::substr($this->word, $pronounAt);

    // (a) accented verb endings followed by the pronoun
    $accented = array();
    foreach (array('iéndo', 'ándo', 'ár', 'ér', 'ír') as $ending) {
        $accented[] = $ending . $pronoun;
    }

    $endingAt = $this->searchIfInRv($accented);
    if ($endingAt !== false) {
        // De-accent the tail, glue it back, then cut the pronoun off
        $tail       = Utf8::deaccent(Utf8::substr($this->word, $endingAt), -1);
        $this->word = Utf8::substr($this->word, 0, $endingAt) . $tail;
        $this->word = Utf8::substr($this->word, 0, $pronounAt);

        return true;
    }

    // (b) unaccented verb endings followed by the pronoun
    $plain = array();
    foreach (array('iendo', 'ando', 'ar', 'er', 'ir') as $ending) {
        $plain[] = $ending . $pronoun;
    }

    if ($this->searchIfInRv($plain) !== false) {
        $this->word = Utf8::substr($this->word, 0, $pronounAt);

        return true;
    }

    // (c) yendo + pronoun, preceded by u (loose comparison kept as in the other lookups)
    $yendoAt = $this->searchIfInRv(array('yendo' . $pronoun));
    if ($yendoAt != false && Utf8::substr($this->word, ($yendoAt-1), 1) == 'u') {
        $this->word = Utf8::substr($this->word, 0, $pronounAt);

        return true;
    }

    return false;
}
|
||||
|
||||
/**
 * Step 1: standard suffix removal.
 *
 * The suffix groups below are tried in order against the end of the current
 * word. The first group with a matching suffix is applied and ends the step;
 * the suffix is only removed (or replaced) when it starts inside the region
 * named for its group.
 *
 * @return boolean true when the word ends with one of the step 1 suffixes (even
 *                 if the suffix lay outside its region and was left in place),
 *                 false when none of them matches
 */
private function step1()
{
    // anza anzas ico ica icos icas ismo ismos able ables ible ibles ista
    // istas oso osa osos osas amiento amientos imiento imientos
    //      delete if in R2
    $position = $this->search(array(
        'imientos', 'imiento', 'amientos', 'amiento', 'osas', 'osos', 'osa', 'oso', 'istas', 'ista', 'ibles',
        'ible', 'ables', 'able', 'ismos', 'ismo', 'icas', 'icos', 'ica', 'ico', 'anzas', 'anza'
    ));

    if ($position !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        return true;
    }

    // adora ador ación adoras adores aciones ante antes ancia ancias
    //      delete if in R2
    //      if preceded by ic, delete if in R2
    $position = $this->search(array(
        'adoras', 'adora', 'aciones', 'ación', 'adores', 'ador', 'antes', 'ante', 'ancias', 'ancia'
    ));

    if ($position !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        $icPosition = $this->searchIfInR2(array('ic'));

        if ($icPosition !== false) {
            $this->word = Utf8::substr($this->word, 0, $icPosition);
        }

        return true;
    }

    // logía logías
    //      replace with log if in R2
    $position = $this->search(array('logías', 'logía'));

    if ($position !== false) {
        if ($this->inR2($position)) {
            // $position is where the matched suffix starts, so keep the prefix and append the replacement
            $this->word = Utf8::substr($this->word, 0, $position) . 'log';
        }

        return true;
    }

    // ución uciones
    //      replace with u if in R2
    $position = $this->search(array('uciones', 'ución'));

    if ($position !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position) . 'u';
        }

        return true;
    }

    // encia encias
    //      replace with ente if in R2
    $position = $this->search(array('encias', 'encia'));

    if ($position !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position) . 'ente';
        }

        return true;
    }

    // amente
    //      delete if in R1
    //      if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
    //      if preceded by os, ic or ad, delete if in R2
    $position = $this->search(array('amente'));

    if ($position !== false) {
        if ($this->inR1($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        $ivPosition = $this->searchIfInR2(array('iv'));

        if ($ivPosition !== false) {
            $this->word = Utf8::substr($this->word, 0, $ivPosition);

            $atPosition = $this->searchIfInR2(array('at'));

            if ($atPosition !== false) {
                $this->word = Utf8::substr($this->word, 0, $atPosition);
            }
        } else {
            $otherPosition = $this->searchIfInR2(array('os', 'ic', 'ad'));

            if ($otherPosition !== false) {
                $this->word = Utf8::substr($this->word, 0, $otherPosition);
            }
        }

        return true;
    }

    // mente
    //      delete if in R2
    //      if preceded by ante, able or ible, delete if in R2
    $position = $this->search(array('mente'));

    if ($position !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        $precedingPosition = $this->searchIfInR2(array('ante', 'able', 'ible'));

        if ($precedingPosition !== false) {
            $this->word = Utf8::substr($this->word, 0, $precedingPosition);
        }

        return true;
    }

    // idad idades
    //      delete if in R2
    //      if preceded by abil, ic or iv, delete if in R2
    $position = $this->search(array('idades', 'idad'));

    if ($position !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        $precedingPosition = $this->searchIfInR2(array('abil', 'ic', 'iv'));

        if ($precedingPosition !== false) {
            $this->word = Utf8::substr($this->word, 0, $precedingPosition);
        }

        return true;
    }

    // iva ivo ivas ivos
    //      delete if in R2
    //      if preceded by at, delete if in R2
    $position = $this->search(array('ivas', 'ivos', 'iva', 'ivo'));

    if ($position !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        $atPosition = $this->searchIfInR2(array('at'));

        if ($atPosition !== false) {
            $this->word = Utf8::substr($this->word, 0, $atPosition);
        }

        return true;
    }

    return false;
}
|
||||
|
||||
/**
 * Step 2a: verb suffixes beginning with y.
 *
 * Looks in RV for one of the y-suffixes and deletes it when the letter right
 * before it is "u" (that u itself does not have to be inside RV).
 *
 * @return boolean true when a suffix was deleted, false otherwise
 */
private function step2a()
{
    $ySuffixes = array(
        'yamos', 'yendo', 'yeron', 'yan', 'yen', 'yais', 'yas', 'yes', 'yo', 'yó', 'ya', 'ye'
    );

    $start = $this->searchIfInRv($ySuffixes);

    // nothing found (a match at offset 0 is likewise ignored)
    if ($start === false || $start < 1) {
        return false;
    }

    $precedingLetter = Utf8::substr($this->word, $start - 1, 1);

    if ($precedingLetter != 'u') {
        return false;
    }

    $this->word = Utf8::substr($this->word, 0, $start);

    return true;
}
|
||||
|
||||
/**
 * Step 2b: other verb suffixes.
 *
 * Searches RV for one of the verb suffixes below and deletes it. For the
 * endings en, es, éis and emos a "gu" left at the end of the word afterwards is
 * shortened to "g" (the gu need not be in RV).
 *
 * @return boolean true when a suffix was deleted, false otherwise
 */
private function step2b()
{
    // delete
    $position = $this->searchIfInRv(array(
        'iésemos', 'iéramos', 'ábamos', 'iríamos', 'eríamos', 'aríamos', 'áramos', 'ásemos', 'eríais',
        'aremos', 'eremos', 'iremos', 'asteis', 'ieseis', 'ierais', 'isteis', 'aríais',
        'irían', 'aréis', 'erían', 'erías', 'eréis', 'iréis', 'irías', 'ieran', 'iesen', 'ieron', 'iendo', 'ieras',
        'iríais', 'arían', 'arías',
        'amos', 'imos', 'ados', 'idos', 'irán', 'irás', 'erán', 'erás', 'ería', 'iría', 'íais', 'arán', 'arás', 'aría',
        'iera', 'iese', 'aste', 'iste', 'aban', 'aran', 'asen', 'aron', 'ando', 'abas', 'adas', 'idas', 'ases', 'aras',
        'aré', 'erá', 'eré', 'áis', 'ías', 'irá', 'iré', 'aba', 'ían', 'ada', 'ara', 'ase', 'ida', 'ado', 'ido', 'ará',
        'ad', 'ed', 'id', 'ís', 'ió', 'ar', 'er', 'ir', 'as', 'ía', 'an'
    ));

    if ($position !== false) {
        $this->word = Utf8::substr($this->word, 0, $position);

        return true;
    }

    // en es éis emos
    //      delete, and if preceded by gu delete the u (the gu need not be in RV)
    $position = $this->searchIfInRv(array('éis', 'emos', 'en', 'es'));

    if ($position !== false) {
        $this->word = Utf8::substr($this->word, 0, $position);

        $guPosition = $this->search(array('gu'));

        if ($guPosition !== false) {
            // keep the g, drop the u
            $this->word = Utf8::substr($this->word, 0, $guPosition + 1);
        }

        return true;
    }

    // explicit result instead of falling off the end of the method
    return false;
}
|
||||
|
||||
/**
 * Step 3: residual suffix.
 *
 * Searches RV for one of the residual endings and deletes it. After deleting a
 * final e or é, a "u" that now ends the word inside RV and is preceded by "g"
 * is deleted as well.
 *
 * @return boolean true when the word was changed, false otherwise
 */
private function step3()
{
    // os a o á í ó
    //      delete if in RV
    $position = $this->searchIfInRv(array('os', 'a', 'o', 'á', 'í', 'ó'));

    if ($position !== false) {
        $this->word = Utf8::substr($this->word, 0, $position);

        return true;
    }

    // e é
    //      delete if in RV, and if preceded by gu with the u in RV delete the u
    $position = $this->searchIfInRv(array('e', 'é'));

    if ($position !== false) {
        $this->word = Utf8::substr($this->word, 0, $position);

        $uPosition = $this->searchIfInRv(array('u'));

        if ($uPosition !== false && Utf8::substr($this->word, $uPosition - 1, 1) === 'g') {
            $this->word = Utf8::substr($this->word, 0, $uPosition);
        }

        // the e/é was removed, so the word changed whether or not the u was dropped too
        return true;
    }

    return false;
}
|
||||
|
||||
/**
 * Final clean-up: strip the acute accent from á, í, ó, é and ú in the word.
 */
private function finish()
{
    $accented = array('á', 'í', 'ó', 'é', 'ú');
    $plain    = array('a', 'i', 'o', 'e', 'u');

    $this->word = Utf8::str_replace($accented, $plain, $this->word);
}
|
||||
}
|
218
libraries/vendor/wamania/php-stemmer/src/Stem.php
vendored
Normal file
218
libraries/vendor/wamania/php-stemmer/src/Stem.php
vendored
Normal file
|
@ -0,0 +1,218 @@
|
|||
<?php
|
||||
namespace Wamania\Snowball;
|
||||
|
||||
/**
 * Shared state and region helpers (RV, R1, R2, suffix search) for the stemmers.
 */
abstract class Stem implements Stemmer
{
    /**
     * Letters counted as vowels when the regions are computed.
     * Read through static::, so a language class can redeclare the list.
     * @var array
     */
    protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y');

    /**
     * helper, contains stringified list of vowels
     * @var string
     */
    protected $plainVowels;

    /**
     * The word we are stemming
     * @var string
     */
    protected $word;

    /**
     * The original word, use to check if word has been modified
     * @var string
     */
    protected $originalWord;

    /**
     * RV value (the text of the region)
     * @var string
     */
    protected $rv;

    /**
     * RV index (based on the beginning of the word)
     * @var integer
     */
    protected $rvIndex;

    /**
     * R1 value (the text of the region)
     * @var string
     */
    protected $r1;

    /**
     * R1 index (based on the beginning of the word)
     * @var integer
     */
    protected $r1Index;

    /**
     * R2 value (the text of the region)
     * @var string
     */
    protected $r2;

    /**
     * R2 index (based on the beginning of the word)
     * @var integer
     */
    protected $r2Index;

    /**
     * Tells whether a position lies inside RV.
     *
     * @param integer $position position counted from the beginning of the word
     *
     * @return boolean
     */
    protected function inRv($position)
    {
        return ($position >= $this->rvIndex);
    }

    /**
     * Tells whether a position lies inside R1.
     *
     * @param integer $position position counted from the beginning of the word
     *
     * @return boolean
     */
    protected function inR1($position)
    {
        return ($position >= $this->r1Index);
    }

    /**
     * Tells whether a position lies inside R2.
     *
     * @param integer $position position counted from the beginning of the word
     *
     * @return boolean
     */
    protected function inR2($position)
    {
        return ($position >= $this->r2Index);
    }

    /**
     * Same as search(), with the RV index as search offset.
     *
     * @param array $suffixes candidate suffixes, checked in list order
     *
     * @return integer|false
     */
    protected function searchIfInRv($suffixes)
    {
        return $this->search($suffixes, $this->rvIndex);
    }

    /**
     * Same as search(), with the R1 index as search offset.
     *
     * @param array $suffixes candidate suffixes, checked in list order
     *
     * @return integer|false
     */
    protected function searchIfInR1($suffixes)
    {
        return $this->search($suffixes, $this->r1Index);
    }

    /**
     * Same as search(), with the R2 index as search offset.
     *
     * @param array $suffixes candidate suffixes, checked in list order
     *
     * @return integer|false
     */
    protected function searchIfInR2($suffixes)
    {
        return $this->search($suffixes, $this->r2Index);
    }

    /**
     * Finds the first suffix of the list that ends the current word.
     *
     * The candidates are tested in list order, so a caller that wants the longest
     * match has to list longer suffixes before the shorter ones they end with.
     *
     * @param array   $suffixes candidate suffixes
     * @param integer $offset   offset handed to Utf8::strrpos() as the start of the search
     *                          (the region helpers pass a region index here)
     *
     * @return integer|false position at which the matching suffix starts, or false
     *                       when no candidate ends the word; compare with === false,
     *                       since 0 is a valid position
     */
    protected function search($suffixes, $offset = 0)
    {
        $length = Utf8::strlen($this->word);

        if ($offset > $length) {
            return false;
        }

        foreach ($suffixes as $suffixe) {
            $position = Utf8::strrpos($this->word, $suffixe, $offset);

            // only an occurrence that runs up to the end of the word counts as a suffix
            if ($position !== false && ($position + Utf8::strlen($suffixe)) == $length) {
                return $position;
            }
        }

        return false;
    }

    /**
     * R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel.
     */
    protected function r1()
    {
        list($this->r1Index, $this->r1) = $this->rx($this->word);
    }

    /**
     * R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel.
     */
    protected function r2()
    {
        list($index, $value) = $this->rx($this->r1);

        $this->r2 = $value;
        // rx() counted from the start of R1; shift so the index is based on the whole word
        $this->r2Index = $this->r1Index + $index;
    }

    /**
     * Common function for R1 and R2
     * Search the region after the first non-vowel following a vowel in $in, or the end of the word if there is no such non-vowel.
     * R1 : $in = $this->word
     * R2 : $in = R1
     *
     * @param string $in text in which the region is looked for
     *
     * @return array array(index of the region inside $in, text of the region); when no
     *               vowel is followed by a non-vowel this is array(length of $in, '')
     */
    protected function rx($in)
    {
        $length = Utf8::strlen($in);

        $previousIsVowel = false;

        for ($i = 0; $i < $length; $i++) {
            $isVowel = in_array(Utf8::substr($in, $i, 1), static::$vowels, true);

            // first non-vowel directly after a vowel: the region starts right behind it
            if ($previousIsVowel && !$isVowel) {
                return array($i + 1, Utf8::substr($in, $i + 1));
            }

            $previousIsVowel = $isVowel;
        }

        // A vowel in last position has no letter after it, so it cannot open a region:
        // the region is the (empty) end of the word and the index never exceeds the length.
        return array($length, '');
    }

    /**
     * Used by spanish, italian, portuguese, etc (but not by french)
     *
     * If the second letter is a consonant, RV is the region after the next following vowel,
     * or if the first two letters are vowels, RV is the region after the next consonant,
     * and otherwise (consonant-vowel case) RV is the region after the third letter.
     * But RV is the end of the word if these positions cannot be found.
     *
     * @return boolean always true; the result is stored in $this->rv and $this->rvIndex
     */
    protected function rv()
    {
        $length = Utf8::strlen($this->word);

        // default: RV is the (empty) end of the word
        $this->rv = '';
        $this->rvIndex = $length;

        if ($length < 3) {
            return true;
        }

        $firstIsVowel  = in_array(Utf8::substr($this->word, 0, 1), static::$vowels, true);
        $secondIsVowel = in_array(Utf8::substr($this->word, 1, 1), static::$vowels, true);

        // consonant-vowel case: RV is the region after the third letter
        if ($secondIsVowel && !$firstIsVowel) {
            $this->rvIndex = 3;
            $this->rv = Utf8::substr($this->word, 3);

            return true;
        }

        // second letter is a consonant: RV starts after the next vowel;
        // first two letters are vowels: RV starts after the next consonant
        $lookingForVowel = !$secondIsVowel;

        for ($i = 2; $i < $length; $i++) {
            $isVowel = in_array(Utf8::substr($this->word, $i, 1), static::$vowels, true);

            if ($isVowel === $lookingForVowel) {
                $this->rvIndex = $i + 1;
                $this->rv = Utf8::substr($this->word, $i + 1);

                break;
            }
        }

        // when nothing was found the defaults set above stay in place
        return true;
    }
}
|
19
libraries/vendor/wamania/php-stemmer/src/Stemmer.php
vendored
Normal file
19
libraries/vendor/wamania/php-stemmer/src/Stemmer.php
vendored
Normal file
|
@ -0,0 +1,19 @@
|
|||
<?php
|
||||
namespace Wamania\Snowball;
|
||||
|
||||
/**
|
||||
* @author Luís Cobucci <lcobucci@gmail.com>
|
||||
*/
|
||||
interface Stemmer
{
    /**
     * Reduce a word to its stem.
     *
     * @param string $word The word to stem, as a valid UTF-8 string
     *
     * @return string The stem of the given word
     *
     * @throws \Exception
     */
    public function stem($word);
}
|
127
libraries/vendor/wamania/php-stemmer/src/Swedish.php
vendored
Normal file
127
libraries/vendor/wamania/php-stemmer/src/Swedish.php
vendored
Normal file
|
@ -0,0 +1,127 @@
|
|||
<?php
|
||||
namespace Wamania\Snowball;
|
||||
|
||||
/**
|
||||
*
|
||||
* @link http://snowball.tartarus.org/algorithms/swedish/stemmer.html
|
||||
* @author wamania
|
||||
*
|
||||
*/
|
||||
class Swedish extends Stem
{
    /**
     * All swedish vowels
     */
    protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y', 'ä', 'å', 'ö');

    /**
     * {@inheritdoc}
     *
     * The word is lower-cased, R1 is computed and steps 1 to 3 are applied in order.
     *
     * @throws \Exception when Utf8::check() rejects the word
     */
    public function stem($word)
    {
        // we do ALL in UTF-8
        if (! Utf8::check($word)) {
            throw new \Exception('Word must be in UTF-8');
        }

        $this->word = Utf8::strtolower($word);

        // R2 is not used: R1 is defined in the same way as in the German stemmer
        $this->r1();

        // then R1 is adjusted so that the region before it contains at least 3 letters.
        if ($this->r1Index < 3) {
            $this->r1Index = 3;
            $this->r1 = Utf8::substr($this->word, 3);
        }

        // Do each of steps 1, 2 and 3.
        $this->step1();
        $this->step2();
        $this->step3();

        return $this->word;
    }

    /**
     * Define a valid s-ending as one of
     * b c d f g h j k l m n o p r t v y
     *
     * @param string $word the text standing in front of the final s
     *
     * @return boolean true when the last letter of $word is a valid s-ending
     */
    private function hasValidSEnding($word)
    {
        $lastLetter = Utf8::substr($word, -1, 1);

        return in_array(
            $lastLetter,
            array('b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y'),
            true
        );
    }

    /**
     * Step 1
     * Search for the longest among the following suffixes in R1, and perform the action indicated.
     *
     * @return boolean true when the word was changed, false otherwise
     */
    private function step1()
    {
        // a arna erna heterna orna ad e ade ande arne are aste en anden aren heten
        // ern ar er heter or as arnas ernas ornas es ades andes ens arens hetens
        // erns at andet het ast
        //      delete
        // (listed longest first, because the search stops at the first suffix that matches)
        $position = $this->searchIfInR1(array(
            'heterna', 'hetens', 'ornas', 'andes', 'arnas', 'heter', 'ernas', 'anden', 'heten', 'andet', 'arens',
            'orna', 'arna', 'erna', 'aren', 'ande', 'ades', 'arne', 'erns', 'aste', 'ade', 'ern', 'het',
            'ast', 'are', 'ens', 'or', 'es', 'ad', 'en', 'at', 'ar', 'as', 'er', 'a', 'e'
        ));

        if ($position !== false) {
            $this->word = Utf8::substr($this->word, 0, $position);

            return true;
        }

        // s
        //      delete if preceded by a valid s-ending
        $position = $this->searchIfInR1(array('s'));

        if ($position !== false) {
            $withoutS = Utf8::substr($this->word, 0, $position);

            if ($this->hasValidSEnding($withoutS)) {
                $this->word = $withoutS;

                return true;
            }
        }

        return false;
    }

    /**
     * Step 2
     * Search for one of the following suffixes in R1, and if found delete the last letter.
     *
     * @return boolean true when the word was changed, false otherwise
     */
    private function step2()
    {
        // dd gd nn dt gt kt tt
        if ($this->searchIfInR1(array('dd', 'gd', 'nn', 'dt', 'gt', 'kt', 'tt')) === false) {
            return false;
        }

        $this->word = Utf8::substr($this->word, 0, -1);

        return true;
    }

    /**
     * Step 3:
     * Search for the longest among the following suffixes in R1, and perform the action indicated.
     *
     * @return boolean true when the word was changed, false otherwise
     */
    private function step3()
    {
        // lig ig els
        //      delete
        $position = $this->searchIfInR1(array('lig', 'ig', 'els'));

        if ($position !== false) {
            $this->word = Utf8::substr($this->word, 0, $position);

            return true;
        }

        // löst
        //      replace with lös
        // fullt
        //      replace with full
        // (both replacements amount to dropping the final t)
        if ($this->searchIfInR1(array('löst')) !== false || $this->searchIfInR1(array('fullt')) !== false) {
            $this->word = Utf8::substr($this->word, 0, -1);

            return true;
        }

        return false;
    }
}
|
708
libraries/vendor/wamania/php-stemmer/src/Utf8.php
vendored
Normal file
708
libraries/vendor/wamania/php-stemmer/src/Utf8.php
vendored
Normal file
|
@ -0,0 +1,708 @@
|
|||
<?php
|
||||
namespace Wamania\Snowball;
|
||||
|
||||
/**
|
||||
* UTF8 helper functions
|
||||
*
|
||||
* @license LGPL (http://www.gnu.org/copyleft/lesser.html)
|
||||
* @author Andreas Gohr <andi@splitbrain.org>
|
||||
* @package Stato
|
||||
* @subpackage view
|
||||
*/
|
||||
|
||||
class Utf8
|
||||
{
|
||||
/**
|
||||
* UTF-8 lookup table for lower case accented letters
|
||||
*
|
||||
* This lookuptable defines replacements for accented characters from the ASCII-7
|
||||
* range. These are lower case letters only.
|
||||
*
|
||||
* @author Andreas Gohr <andi@splitbrain.org>
|
||||
* @see utf8_deaccent()
|
||||
*/
|
||||
private static $utf8_lower_accents = array(
|
||||
'à' => 'a', 'ô' => 'o', 'd' => 'd', '?' => 'f', 'ë' => 'e', 'š' => 's', 'o' => 'o',
|
||||
'ß' => 'ss', 'a' => 'a', 'r' => 'r', '?' => 't', 'n' => 'n', 'a' => 'a', 'k' => 'k',
|
||||
's' => 's', '?' => 'y', 'n' => 'n', 'l' => 'l', 'h' => 'h', '?' => 'p', 'ó' => 'o',
|
||||
'ú' => 'u', 'e' => 'e', 'é' => 'e', 'ç' => 'c', '?' => 'w', 'c' => 'c', 'õ' => 'o',
|
||||
'?' => 's', 'ø' => 'o', 'g' => 'g', 't' => 't', '?' => 's', 'e' => 'e', 'c' => 'c',
|
||||
's' => 's', 'î' => 'i', 'u' => 'u', 'c' => 'c', 'e' => 'e', 'w' => 'w', '?' => 't',
|
||||
'u' => 'u', 'c' => 'c', 'ö' => 'oe', 'è' => 'e', 'y' => 'y', 'a' => 'a', 'l' => 'l',
|
||||
'u' => 'u', 'u' => 'u', 's' => 's', 'g' => 'g', 'l' => 'l', 'ƒ' => 'f', 'ž' => 'z',
|
||||
'?' => 'w', '?' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', '?' => 'd', 't' => 't',
|
||||
'r' => 'r', 'ä' => 'ae', 'í' => 'i', 'r' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
|
||||
'e' => 'e', 'ñ' => 'n', 'n' => 'n', 'h' => 'h', 'g' => 'g', 'd' => 'd', 'j' => 'j',
|
||||
'ÿ' => 'y', 'u' => 'u', 'u' => 'u', 'u' => 'u', 't' => 't', 'ý' => 'y', 'o' => 'o',
|
||||
'â' => 'a', 'l' => 'l', '?' => 'w', 'z' => 'z', 'i' => 'i', 'ã' => 'a', 'g' => 'g',
|
||||
'?' => 'm', 'o' => 'o', 'i' => 'i', 'ù' => 'u', 'i' => 'i', 'z' => 'z', 'á' => 'a',
|
||||
'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u',
|
||||
);
|
||||
|
||||
/**
|
||||
* UTF-8 Case lookup table
|
||||
*
|
||||
* This lookuptable defines the upper case letters to their corresponding
|
||||
* lower case letter in UTF-8
|
||||
*
|
||||
* @author Andreas Gohr <andi@splitbrain.org>
|
||||
*/
|
||||
private static $utf8_lower_to_upper = array(
|
||||
0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
|
||||
0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
|
||||
0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
|
||||
0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
|
||||
0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
|
||||
0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
|
||||
0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
|
||||
0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
|
||||
0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
|
||||
0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
|
||||
0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
|
||||
0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
|
||||
0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
|
||||
0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
|
||||
0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
|
||||
0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
|
||||
0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
|
||||
0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
|
||||
0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
|
||||
0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
|
||||
0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
|
||||
0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
|
||||
0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
|
||||
0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
|
||||
0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
|
||||
0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
|
||||
0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
|
||||
0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
|
||||
0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
|
||||
0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
|
||||
0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
|
||||
0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
|
||||
0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
|
||||
0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
|
||||
0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
|
||||
0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
|
||||
0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
|
||||
0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
|
||||
0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
|
||||
0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
|
||||
0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
|
||||
0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
|
||||
0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122,
|
||||
);
|
||||
|
||||
/**
|
||||
* UTF-8 Case lookup table
|
||||
*
|
||||
* This lookuptable defines the lower case letters to their corresponding
|
||||
* upper case letter in UTF-8 (it does so by flipping $utf8_lower_to_upper)
|
||||
*
|
||||
* @author Andreas Gohr <andi@splitbrain.org>
|
||||
*/
|
||||
//private static $utf8_upper_to_lower = array_flip(self::$utf8_lower_to_upper);
|
||||
|
||||
|
||||
/**
|
||||
* UTF-8 lookup table for upper case accented letters
|
||||
*
|
||||
* This lookuptable defines replacements for accented characters from the ASCII-7
|
||||
* range. These are upper case letters only.
|
||||
*
|
||||
* @author Andreas Gohr <andi@splitbrain.org>
|
||||
* @see utf8_deaccent()
|
||||
*/
|
||||
private static $utf8_upper_accents = array(
|
||||
'À' => 'A', 'Ô' => 'O', 'D' => 'D', '?' => 'F', 'Ë' => 'E', 'Š' => 'S', 'O' => 'O',
|
||||
'A' => 'A', 'R' => 'R', '?' => 'T', 'N' => 'N', 'A' => 'A', 'K' => 'K',
|
||||
'S' => 'S', '?' => 'Y', 'N' => 'N', 'L' => 'L', 'H' => 'H', '?' => 'P', 'Ó' => 'O',
|
||||
'Ú' => 'U', 'E' => 'E', 'É' => 'E', 'Ç' => 'C', '?' => 'W', 'C' => 'C', 'Õ' => 'O',
|
||||
'?' => 'S', 'Ø' => 'O', 'G' => 'G', 'T' => 'T', '?' => 'S', 'E' => 'E', 'C' => 'C',
|
||||
'S' => 'S', 'Î' => 'I', 'U' => 'U', 'C' => 'C', 'E' => 'E', 'W' => 'W', '?' => 'T',
|
||||
'U' => 'U', 'C' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Y' => 'Y', 'A' => 'A', 'L' => 'L',
|
||||
'U' => 'U', 'U' => 'U', 'S' => 'S', 'G' => 'G', 'L' => 'L', 'ƒ' => 'F', 'Ž' => 'Z',
|
||||
'?' => 'W', '?' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', '?' => 'D', 'T' => 'T',
|
||||
'R' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'R' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
|
||||
'E' => 'E', 'Ñ' => 'N', 'N' => 'N', 'H' => 'H', 'G' => 'G', 'Ð' => 'D', 'J' => 'J',
|
||||
'Ÿ' => 'Y', 'U' => 'U', 'U' => 'U', 'U' => 'U', 'T' => 'T', 'Ý' => 'Y', 'O' => 'O',
|
||||
'Â' => 'A', 'L' => 'L', '?' => 'W', 'Z' => 'Z', 'I' => 'I', 'Ã' => 'A', 'G' => 'G',
|
||||
'?' => 'M', 'O' => 'O', 'I' => 'I', 'Ù' => 'U', 'I' => 'I', 'Z' => 'Z', 'Á' => 'A',
|
||||
'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae',
|
||||
);
|
||||
|
||||
/**
|
||||
* UTF-8 array of common special characters
|
||||
*
|
||||
* This array should contain all special characters (not a letter or digit)
|
||||
* defined in the various local charsets - it's not a complete list of non-alphanum
|
||||
* characters in UTF-8. It's not perfect but should match most cases of special
|
||||
* chars.
|
||||
*
|
||||
* The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
|
||||
* These chars are _not_ in the array either: _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
|
||||
*
|
||||
* @author Andreas Gohr <andi@splitbrain.org>
|
||||
* @see utf8_stripspecials()
|
||||
*/
|
||||
private static $utf8_special_chars = array(
|
||||
0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
|
||||
0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002b, 0x002c,
|
||||
0x002f, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
|
||||
0x005c, 0x005d, 0x005e, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
|
||||
0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
|
||||
0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
|
||||
0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
|
||||
0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
|
||||
0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
|
||||
0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
|
||||
0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
|
||||
0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
|
||||
0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
|
||||
0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
|
||||
0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
|
||||
0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
|
||||
0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
|
||||
0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
|
||||
0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
|
||||
0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
|
||||
0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
|
||||
0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
|
||||
0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
|
||||
0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
|
||||
0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
|
||||
0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
|
||||
0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
|
||||
0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
|
||||
0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
|
||||
0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
|
||||
0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
|
||||
0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
|
||||
0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
|
||||
0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
|
||||
0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
|
||||
0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
|
||||
0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
|
||||
0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
|
||||
0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
|
||||
0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
|
||||
0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
|
||||
0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
|
||||
0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
|
||||
0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
|
||||
0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
|
||||
0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
|
||||
0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
|
||||
0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
|
||||
0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
|
||||
0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
|
||||
);
|
||||
|
||||
/**
 * URL-Encode a filename to allow unicode characters
 *
 * Slashes are not encoded
 *
 * When the second parameter is true the string is returned untouched
 * if it consists only of the characters a-z A-Z 0-9 / _ - . and % -
 * This makes it safe to run it multiple times on the
 * same string (default is true)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 *
 * @param string  $file the filename to encode
 * @param boolean $safe skip the encoding when $file only holds the characters listed above
 *
 * @return string the encoded filename
 */
public static function encode_fn($file,$safe=true)
{
    // The D modifier makes $ match at the very end only; without it a name ending
    // in a line feed would pass the check and be returned unencoded.
    if ($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#D', $file)) {
        return $file;
    }

    $file = urlencode($file);
    $file = str_replace('%2F', '/', $file);

    return $file;
}
|
||||
|
||||
/**
 * URL-decode a filename.
 *
 * Thin wrapper that delegates directly to urldecode().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 *
 * @param  string $file URL-encoded filename
 * @return string decoded filename
 */
public static function decode_fn($file)
{
    return urldecode($file);
}
|
||||
|
||||
/**
 * Checks whether a string contains 7-bit ASCII bytes only.
 *
 * An empty string counts as ASCII.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $str string to inspect
 * @return bool   true when no byte is above 0x7F
 */
public static function is_ascii($str)
{
    // Byte-wise match (no /u flag): any byte outside 0x00-0x7F disqualifies.
    // This replaces the former $str{$i} loop, whose curly-brace offset
    // syntax no longer parses on PHP 8.
    return preg_match('/[^\x00-\x7F]/', (string) $str) !== 1;
}
|
||||
|
||||
/**
 * Strips all high-byte characters.
 *
 * Every byte with a value of 128 or above is dropped, so the result
 * is a pure 7-bit ASCII string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $str input string
 * @return string input with all bytes >= 0x80 removed
 */
public static function strip($str)
{
    $ascii = '';
    $len   = strlen($str);

    for ($i = 0; $i < $len; $i++) {
        // square-bracket offsets: the {} form is a parse error on PHP 8
        $byte = $str[$i];
        if (ord($byte) < 128) {
            $ascii .= $byte;
        }
    }

    return $ascii;
}
|
||||
|
||||
/**
 * Tries to detect whether a string is UTF-8 encoded.
 *
 * Walks the string byte by byte: every lead byte must match one of
 * the 1- to 6-byte lead patterns and must be followed by the
 * announced number of 10bbbbbb continuation bytes.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 *
 * @param  string $str string to test
 * @return bool   false on the first byte that breaks the pattern
 */
public static function check($str)
{
    $total = strlen($str);

    for ($pos = 0; $pos < $total; $pos++) {
        $lead = ord($str[$pos]);

        if ($lead < 0x80) {
            // 0bbbbbbb - plain ASCII, nothing follows
            continue;
        }

        if (($lead & 0xE0) === 0xC0) {
            $follow = 1; // 110bbbbb
        } elseif (($lead & 0xF0) === 0xE0) {
            $follow = 2; // 1110bbbb
        } elseif (($lead & 0xF8) === 0xF0) {
            $follow = 3; // 11110bbb
        } elseif (($lead & 0xFC) === 0xF8) {
            $follow = 4; // 111110bb
        } elseif (($lead & 0xFE) === 0xFC) {
            $follow = 5; // 1111110b
        } else {
            // matches no lead-byte model
            return false;
        }

        // the announced continuation bytes must exist and look like 10bbbbbb
        while ($follow-- > 0) {
            $pos++;
            if ($pos === $total || (ord($str[$pos]) & 0xC0) !== 0x80) {
                return false;
            }
        }
    }

    return true;
}
|
||||
|
||||
/**
 * Unicode aware replacement for strlen()
 *
 * Uses mb_strlen() when the mbstring extension is available and
 * UTF8_NOMBSTRING is not defined. Otherwise the length is obtained by
 * counting every byte that is not a UTF-8 continuation byte
 * (10bbbbbb), which equals the number of characters for well-formed
 * input. This replaces the former utf8_decode() trick, since
 * utf8_decode() is deprecated as of PHP 8.2.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 *
 * @param  string $string UTF-8 encoded string
 * @return int    number of characters
 */
public static function strlen($string)
{
    if (!defined('UTF8_NOMBSTRING') && function_exists('mb_strlen')) {
        return mb_strlen($string, 'utf-8');
    }

    // byte-mode regexp (no /u): strip continuation bytes, count what is left
    return strlen(preg_replace('/[\x80-\xBF]/', '', (string) $string));
}
|
||||
|
||||
/**
 * Unicode aware replacement for substr()
 *
 * The string is split into an array of characters and the requested
 * window is cut out with array_slice(), so $start and $length follow
 * array_slice() semantics (negative values count from the end).
 *
 * @author lmak at NOSPAM dot iti dot gr
 * @link   http://www.php.net/manual/en/function.substr.php
 * @see    substr()
 *
 * @param  string   $str    UTF-8 encoded string
 * @param  int      $start  character offset to start at
 * @param  int|null $length number of characters, null for "up to the end"
 * @return string
 */
public static function substr($str,$start,$length=null)
{
    $ar = array();
    // the s modifier lets "." match line breaks too; without it every
    // newline in the input was silently dropped from the result
    preg_match_all('/./us', $str, $ar);

    // nothing captured (empty input or a failed match): treat as no characters
    $chars = isset($ar[0]) ? $ar[0] : array();

    // strict null check so that an explicit length of 0 yields an empty
    // string instead of being mistaken for "no length given"
    if ($length !== null) {
        return join('', array_slice($chars, $start, $length));
    }

    return join('', array_slice($chars, $start));
}
|
||||
|
||||
/**
 * Unicode aware replacement for substr_replace()
 *
 * Builds the result from the characters before $start, the
 * replacement, and - when a length is given - the characters after
 * $start + $length. With no length the replacement runs to the end
 * of the string. Only non-negative $start / $length values are
 * handled the way the native function handles them.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 *
 * @param  string   $string      UTF-8 encoded input
 * @param  string   $replacement text to put in place
 * @param  int      $start       character offset where replacing begins
 * @param  int|null $length      number of characters to replace, null = to the end
 * @return string
 */
public static function substr_replace($string, $replacement, $start , $length=null )
{
    $ret = '';

    if ($start > 0) {
        $ret .= self::substr($string, 0, $start);
    }

    $ret .= $replacement;

    // strict comparison: a length of 0 means "insert", so the tail must
    // still be appended (the loose != null check used to drop it)
    if ($length !== null) {
        $ret .= self::substr($string, $start + $length);
    }

    return $ret;
}
|
||||
|
||||
/**
 * Unicode aware replacement for explode()
 *
 * The separator is quoted and used as a UTF-8 regular expression for
 * preg_split(). An empty separator raises an E_USER_WARNING and
 * yields false.
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 *
 * @param  string $sep separator
 * @param  string $str string to split
 * @return array|false
 */
public static function explode($sep, $str)
{
    if ($sep == '') {
        trigger_error('Empty delimiter',E_USER_WARNING);

        return false;
    }

    $pattern = '!' . preg_quote($sep, '!') . '!u';

    return preg_split($pattern, $str);
}
|
||||
|
||||
/**
 * Unicode aware replacement for str_replace()
 *
 * Search strings are quoted and turned into UTF-8 regular
 * expressions for preg_replace(). Replacement strings are escaped as
 * well, so that "$1", "\1" or a plain backslash in the replacement is
 * inserted literally instead of being treated as a backreference.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 *
 * @param  string|array $s   search string(s)
 * @param  string|array $r   replacement string(s)
 * @param  string|array $str subject(s)
 * @return string|array|null result of preg_replace()
 */
public static function str_replace($s,$r,$str)
{
    if (!is_array($s)) {
        $s = '!' . preg_quote($s, '!') . '!u';
    } else {
        foreach ($s as $k => $v) {
            // pass the delimiter here too, matching the scalar branch
            $s[$k] = '!' . preg_quote($v, '!') . '!u';
        }
    }

    // preg_replace() interprets "\" and "$" in the replacement; escape
    // both so the text is inserted verbatim
    $literal = array('\\' => '\\\\', '$' => '\\$');
    if (is_array($r)) {
        foreach ($r as $k => $v) {
            $r[$k] = strtr($v, $literal);
        }
    } else {
        $r = strtr($r, $literal);
    }

    return preg_replace($s, $r, $str);
}
|
||||
|
||||
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a character list the native ltrim() is used. With one, the
 * characters are placed in a regular expression character class and
 * stripped from the start of the string. Every character of the list
 * is taken literally; ranges such as "a..z" are not supported.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 *
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip, empty for default whitespace
 * @return string
 */
public static function ltrim($str,$charlist='')
{
    if ($charlist == '') {
        return ltrim($str);
    }

    // preg_quote() escapes everything that is special inside a character
    // class, including "^", which the former hand-written escaping missed
    // (a charlist starting with "^" negated or broke the class)
    $class = preg_quote($charlist, '/');

    return preg_replace('/^[' . $class . ']+/u', '', $str);
}
|
||||
|
||||
/**
 * Unicode aware replacement for rtrim()
 *
 * Without a character list the native rtrim() is used. With one, the
 * characters are placed in a regular expression character class and
 * stripped from the end of the string. Every character of the list
 * is taken literally; ranges such as "a..z" are not supported.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 *
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip, empty for default whitespace
 * @return string
 */
public static function rtrim($str,$charlist='')
{
    if ($charlist == '') {
        return rtrim($str);
    }

    // preg_quote() escapes everything that is special inside a character
    // class, including "^", which the former hand-written escaping missed
    // (a charlist starting with "^" negated or broke the class)
    $class = preg_quote($charlist, '/');

    return preg_replace('/[' . $class . ']+$/u', '', $str);
}
|
||||
|
||||
/**
 * Unicode aware replacement for trim()
 *
 * Without a character list the native trim() is used; otherwise the
 * list is handed to the Unicode aware rtrim() and ltrim() of this
 * class.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 *
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip, empty for default whitespace
 * @return string
 */
public static function trim($str,$charlist='')
{
    if ($charlist == '') {
        return trim($str);
    }

    // forward the character list - it used to be dropped here, so only
    // whitespace was trimmed no matter what the caller asked for
    return self::ltrim(self::rtrim($str, $charlist), $charlist);
}
|
||||
|
||||
|
||||
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses the mbstring extension if available (and UTF8_NOMBSTRING is
 * not defined). The fallback decodes the string into code points and
 * maps each one through the flipped lower-to-upper table of this
 * class.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    strtoupper()
 *
 * @param  string $string UTF-8 encoded string
 * @return string lower-cased string
 */
public static function strtolower($string)
{
    if (!defined('UTF8_NOMBSTRING') && function_exists('mb_strtolower')) {
        return mb_strtolower($string, 'utf-8');
    }

    // the class only stores lower => upper; flip it for this direction
    $utf8_upper_to_lower = array_flip(self::$utf8_lower_to_upper);
    $uni = self::utf8_to_unicode($string);

    foreach ($uni as $i => $codepoint) {
        // isset() avoids an undefined-key warning for every code point
        // that has no case mapping (digits, punctuation, ...)
        if (isset($utf8_upper_to_lower[$codepoint])) {
            $uni[$i] = $utf8_upper_to_lower[$codepoint];
        }
    }

    return self::unicode_to_utf8($uni);
}
|
||||
|
||||
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses the mbstring extension if available (and UTF8_NOMBSTRING is
 * not defined). The fallback decodes the string into code points and
 * maps each one through the lower-to-upper table of this class.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    strtolower()
 *
 * @param  string $string UTF-8 encoded string
 * @return string upper-cased string
 */
public static function strtoupper($string)
{
    // test for the function that is actually called below (the guard
    // used to probe mb_strtolower)
    if (!defined('UTF8_NOMBSTRING') && function_exists('mb_strtoupper')) {
        return mb_strtoupper($string, 'utf-8');
    }

    $uni = self::utf8_to_unicode($string);

    foreach ($uni as $i => $codepoint) {
        // isset() avoids an undefined-key warning for every code point
        // that has no case mapping (digits, punctuation, ...)
        if (isset(self::$utf8_lower_to_upper[$codepoint])) {
            $uni[$i] = self::$utf8_lower_to_upper[$codepoint];
        }
    }

    return self::unicode_to_utf8($uni);
}
|
||||
|
||||
/**
 * Replace accented UTF-8 characters by their unaccented ASCII-7
 * equivalents.
 *
 * $case selects which letters are processed: -1 lower case only,
 * 1 upper case only, 0 (default) both. The lower case table is
 * applied before the upper case one.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $string UTF-8 encoded string
 * @param  int    $case   -1, 0 or 1 as described above
 * @return string
 */
public static function deaccent($string,$case=0)
{
    $tables = array();

    if ($case <= 0) {
        $tables[] = self::$utf8_lower_accents;
    }
    if ($case >= 0) {
        $tables[] = self::$utf8_upper_accents;
    }

    foreach ($tables as $accents) {
        $string = str_replace(array_keys($accents), array_values($accents), $string);
    }

    return $string;
}
|
||||
|
||||
/**
 * Removes special (non-alphanumeric) characters from a UTF-8 string.
 *
 * Besides the characters listed in the class' special-chars table the
 * byte range 0x00 to 0x19 is stripped as well. The quoted table is
 * built once and cached in a static variable for later calls.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (inserted
 *                            unescaped into a regexp char class)
 * @return string
 */
public static function stripspecials($string,$repl='',$additional='')
{
    static $specials = null;

    if ($specials === null) {
        $specials = preg_quote(self::unicode_to_utf8(self::$utf8_special_chars), '/');
    }

    $pattern = '/[' . $additional . '\x00-\x19' . $specials . ']/u';

    return preg_replace($pattern, $repl, $string);
}
|
||||
|
||||
/**
 * This is an Unicode aware replacement for strpos()
 *
 * Uses the mbstring extension if available (and UTF8_NOMBSTRING is
 * not defined). The fallback splits the haystack at the needle and
 * measures the first piece; a non-zero offset is handled by searching
 * the remainder of the haystack and adding the offset back on.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 *
 * @param  string $haystack UTF-8 encoded string to search in
 * @param  string $needle   UTF-8 encoded string to search for
 * @param  int    $offset   character offset to start at
 * @return int|false character position of the first match, false if none
 */
public static function strpos($haystack, $needle, $offset=0)
{
    if (!defined('UTF8_NOMBSTRING') && function_exists('mb_strpos')) {
        return mb_strpos($haystack,$needle,$offset,'utf-8');
    }

    if ($offset) {
        if (!is_int($offset)) {
            trigger_error('Offset must be an integer',E_USER_WARNING);

            return false;
        }

        // search only the part behind the offset, then shift the hit back
        $tail = self::substr($haystack, $offset);
        $hit  = self::strpos($tail, $needle);

        return ($hit === false) ? false : $hit + $offset;
    }

    // more than one piece means the needle occurred; the length of the
    // first piece is the position of the first occurrence
    $pieces = self::explode($needle, $haystack);

    return (count($pieces) > 1) ? self::strlen($pieces[0]) : false;
}
|
||||
|
||||
/**
 * This is an Unicode aware replacement for strrpos()
 *
 * Uses the mbstring extension if available (and UTF8_NOMBSTRING is
 * not defined). The fallback splits the haystack at the needle and
 * derives the position of the last occurrence from the length of the
 * last piece; a non-zero offset is handled by searching the remainder
 * of the haystack and adding the offset back on.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strrpos()
 *
 * @param  string $haystack UTF-8 encoded string to search in
 * @param  string $needle   UTF-8 encoded string to search for
 * @param  int    $offset   character offset to start at
 * @return int|false character position of the last match, false if none
 */
public static function strrpos($haystack, $needle, $offset=0)
{
    if (!defined('UTF8_NOMBSTRING') && function_exists('mb_strrpos')) {
        return mb_strrpos($haystack, $needle, $offset, 'utf-8');
    }

    if ($offset) {
        if (!is_int($offset)) {
            trigger_error('Offset must be an integer', E_USER_WARNING);

            return false;
        }

        // search only the part behind the offset, then shift the hit back
        $tail = self::substr($haystack, $offset);
        $hit  = self::strrpos($tail, $needle);

        return ($hit === false) ? false : $hit + $offset;
    }

    $pieces = self::explode($needle, $haystack);
    $count  = count($pieces);

    if ($count > 1) {
        // everything before the last piece, minus the needle itself
        return self::strlen($haystack) - self::strlen($pieces[($count-1)]) - self::strlen($needle);
    }

    return false;
}
|
||||
|
||||
/**
 * Encodes UTF-8 characters to HTML entities
 *
 * Only two-byte sequences (lead byte 110xxxxx, i.e. U+0080..U+07FF)
 * are converted into numeric entities; all other bytes are copied
 * through unchanged. A lead byte at the very end of the string with
 * no following byte is copied through as well.
 *
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 *
 * @param  string $str UTF-8 encoded string
 * @return string string with two-byte characters replaced by &#NNN;
 */
public static function tohtml ($str)
{
    $ret  = '';
    $max  = strlen($str);
    $last = 0; // index of the first byte not yet copied to $ret

    for ($i = 0; $i < $max; $i++) {
        $c1 = ord($str[$i]);

        // 110x xxxx: lead byte of a two-byte sequence; also require that
        // the second byte exists so we never read past the end
        if (($c1 >> 5) === 6 && $i + 1 < $max) {
            // flush the untouched bytes passed since the last entity
            $ret .= substr($str, $last, $i - $last);

            $c2 = ord($str[++$i]);

            // code point = 5 payload bits of the lead byte followed by the
            // 6 payload bits of the continuation byte (the old code
            // combined the halves with "* 100" instead of "* 256")
            $codepoint = (($c1 & 31) << 6) | ($c2 & 63);

            $ret .= '&#' . $codepoint . ';';
            $last = $i + 1;
        }
    }

    // append the final batch of untouched bytes
    return $ret . substr($str, $last);
}
|
||||
|
||||
/**
 * This function returns any UTF-8 encoded text as a list of
 * Unicode values.
 *
 * Sequences of one to four bytes are decoded. The length of a
 * sequence is taken from its lead byte; the input is not validated,
 * so malformed UTF-8 produces unspecified values.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    unicode_to_utf8()
 *
 * @param  string $str UTF-8 encoded string (taken by reference, not modified)
 * @return int[]  list of code points
 */
public static function utf8_to_unicode( &$str )
{
    $unicode     = array();
    $values      = array(); // bytes collected for the current multi-byte sequence
    $looking_for = 1;       // total number of bytes the current sequence needs
    $len         = strlen($str);

    for ($i = 0; $i < $len; $i++) {
        $this_value = ord($str[$i]);

        if ($this_value < 128) {
            $unicode[] = $this_value;
            continue;
        }

        if (count($values) === 0) {
            // lead byte decides the length: 110xxxxx, 1110xxxx or 11110xxx
            if ($this_value < 224) {
                $looking_for = 2;
            } elseif ($this_value < 240) {
                $looking_for = 3;
            } else {
                // four-byte sequences used to be cut off after three bytes,
                // yielding a wrong code point plus a dangling byte
                $looking_for = 4;
            }
        }

        $values[] = $this_value;

        if (count($values) === $looking_for) {
            if ($looking_for === 4) {
                $number = (($values[0] % 8) * 262144) + (($values[1] % 64) * 4096)
                    + (($values[2] % 64) * 64) + ($values[3] % 64);
            } elseif ($looking_for === 3) {
                $number = (($values[0] % 16) * 4096) + (($values[1] % 64) * 64) + ($values[2] % 64);
            } else {
                $number = (($values[0] % 32) * 64) + ($values[1] % 64);
            }

            $unicode[]   = $number;
            $values      = array();
            $looking_for = 1;
        }
    }

    return $unicode;
}
|
||||
|
||||
/**
 * This function converts a Unicode array back to its UTF-8 representation
 *
 * Each code point is written as a one- to four-byte UTF-8 sequence.
 * Anything that is not an array yields an empty string.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    utf8_to_unicode()
 *
 * @param  int[] $str list of code points (taken by reference, not modified)
 * @return string UTF-8 encoded string
 */
public static function unicode_to_utf8( &$str )
{
    if (!is_array($str)) {
        return '';
    }

    $utf8 = '';

    foreach ($str as $unicode) {
        $unicode = (int) $unicode;

        if ($unicode < 128) {
            $utf8 .= chr($unicode);
        } elseif ($unicode < 2048) {
            $utf8 .= chr(192 | ($unicode >> 6));
            $utf8 .= chr(128 | ($unicode & 63));
        } elseif ($unicode < 65536) {
            $utf8 .= chr(224 | ($unicode >> 12));
            $utf8 .= chr(128 | (($unicode >> 6) & 63));
            $utf8 .= chr(128 | ($unicode & 63));
        } else {
            // code points beyond the BMP need four bytes; they used to be
            // squeezed into the three-byte form, producing invalid UTF-8
            $utf8 .= chr(240 | ($unicode >> 18));
            $utf8 .= chr(128 | (($unicode >> 12) & 63));
            $utf8 .= chr(128 | (($unicode >> 6) & 63));
            $utf8 .= chr(128 | ($unicode & 63));
        }
    }

    return $utf8;
}
|
||||
|
||||
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * Uses mb_convert_encoding() if mbstring is available (and
 * UTF8_NOMBSTRING is not defined). The fallback packs every decoded
 * code point as a single 16-bit big-endian unit, so without mbstring
 * the result may really be UCS-2 due to utf8_to_unicode limits.
 *
 * @param  string $str UTF-8 encoded string (taken by reference)
 * @param  bool   $bom prepend the big-endian byte order mark FE FF
 * @return string UTF-16BE encoded string
 */
public static function utf8_to_utf16be(&$str, $bom = false)
{
    $prefix = $bom ? "\xFE\xFF" : '';

    if (!defined('UTF8_NOMBSTRING') && function_exists('mb_convert_encoding')) {
        return $prefix . mb_convert_encoding($str,'UTF-16BE','UTF-8');
    }

    $encoded = '';
    foreach (self::utf8_to_unicode($str) as $codepoint) {
        // 'n' = unsigned 16-bit, big endian
        $encoded .= pack('n', $codepoint);
    }

    return $prefix . $encoded;
}
|
||||
|
||||
/**
 * UTF-16BE to UTF-8 conversion.
 *
 * Every 16-bit big-endian unit of the input is unpacked and handed to
 * unicode_to_utf8() as one code point.
 *
 * @param  string $str UTF-16BE encoded string (taken by reference)
 * @return string UTF-8 encoded string
 */
public static function utf16be_to_utf8(&$str)
{
    // 'n*' = all unsigned 16-bit big-endian values in the string
    $codepoints = unpack('n*', $str);

    return self::unicode_to_utf8($codepoints);
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user