mirror of
https://github.com/joomla/joomla-cms.git
synced 2024-08-03 17:50:26 +00:00
[4.3] Finder: Fixing tokenisation for Chinese text (#41275)
This commit is contained in:
parent
de6a50ebf9
commit
fafb2756f7
@ -11,7 +11,6 @@
|
||||
namespace Joomla\Component\Finder\Administrator\Indexer\Language;
|
||||
|
||||
use Joomla\Component\Finder\Administrator\Indexer\Language;
|
||||
use Joomla\String\StringHelper;
|
||||
|
||||
// phpcs:disable PSR1.Files.SideEffects
|
||||
\defined('_JEXEC') or die;
|
||||
@ -61,27 +60,12 @@ class Zh extends Language
|
||||
*/
|
||||
/**
 * Method to tokenise a text string in Chinese.
 *
 * Every Han character is first isolated with surrounding whitespace so the
 * generic tokeniser splits it into its own term; mixed terms are then broken
 * apart into their non-Chinese residue plus one term per Chinese character.
 *
 * @param   string  $input  The input to tokenise.
 *
 * @return  array  An array of term strings.
 */
public function tokenise($input)
{
    // We first add whitespace around each Chinese character, so that our later code can easily split on this.
    $input = preg_replace('#\p{Han}#mui', ' $0 ', $input);

    // Now we split up the input into individual terms.
    // NOTE(review): relies on the parent tokeniser splitting on whitespace — confirm against Language::tokenise().
    $terms = parent::tokenise($input);

    // Iterate through the terms and test if they contain Chinese.
    for ($i = 0, $n = count($terms); $i < $n; $i++) {
        $charMatches = [];
        $charCount   = preg_match_all('#[\p{Han}]#mui', $terms[$i], $charMatches);

        // No Chinese characters in this term (or a PCRE error); leave the term untouched.
        if (!$charCount) {
            continue;
        }

        // Work on a local copy so we never read $terms[$i] after it has been unset.
        // (The previous implementation re-read $terms[$i] inside the loop, which
        // emitted an "Undefined array key" warning once the term had been unset,
        // e.g. for terms consisting of repeated Chinese characters.)
        $remainder = $terms[$i];

        // Split apart any groups of Chinese characters: emit each character as
        // its own term and strip it from the remainder of the mixed term.
        for ($j = 0; $j < $charCount; $j++) {
            $remainder = StringHelper::str_ireplace($charMatches[0][$j], '', $remainder, false);
            $terms[]   = $charMatches[0][$j];
        }

        // Keep any non-Chinese residue of the original term; otherwise drop it.
        if (!empty($remainder)) {
            $terms[$i] = $remainder;
        } else {
            unset($terms[$i]);
        }
    }

    return $terms;
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user