29
0
mirror of https://github.com/joomla/joomla-cms.git synced 2024-08-03 17:50:26 +00:00

[4.3] Finder: Fixing tokenisation for chinese text (#41275)

This commit is contained in:
Hannes Papenberg 2023-08-07 21:58:47 +02:00 committed by GitHub
parent de6a50ebf9
commit fafb2756f7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -11,7 +11,6 @@
namespace Joomla\Component\Finder\Administrator\Indexer\Language;
use Joomla\Component\Finder\Administrator\Indexer\Language;
use Joomla\String\StringHelper;
// phpcs:disable PSR1.Files.SideEffects
\defined('_JEXEC') or die;
@ -61,27 +60,12 @@ class Zh extends Language
*/
public function tokenise($input)
{
    // Chinese text is written without spaces between words, so every Han
    // character is surrounded with spaces here. This lets the parent tokeniser
    // (presumably splitting on whitespace - confirm against Language::tokenise)
    // emit each Han character as its own term alongside any non-Chinese words.
    $spaced = preg_replace('#\p{Han}#mui', ' $0 ', $input);

    // preg_replace() returns null on failure (for example malformed UTF-8 when
    // the "u" modifier is used). Fall back to the untouched input instead of
    // handing null to the parent tokeniser.
    if ($spaced === null) {
        $spaced = $input;
    }

    // No per-term post-processing is required: the Han characters are already
    // isolated, so the parent's result is returned as-is, keeping the terms in
    // their original order and without gaps in the array keys.
    return parent::tokenise($spaced);
}
}