First, set the encoding used by the mb_* string functions and by the mb_ereg_* regular-expression functions (the second call may be redundant, but it does no harm to make it explicit):
// Set the internal encoding used by the mb_* string functions to UTF-8.
mb_internal_encoding('UTF-8');
// Set the encoding used by the mb_ereg_* (multibyte regex) functions to UTF-8.
mb_regex_encoding('UTF-8');
If you're just matching a string within a string, mb_stripos works (the 'i' stands for 'case-insensitive'; use mb_strpos for a case-sensitive search):
/**
 * Finds every case-insensitive, non-overlapping occurrence of $needle in $haystack.
 *
 * Uses mb_stripos(), so offsets are counted in characters according to the
 * current mbstring internal encoding.
 *
 * @param string $haystack Text to search.
 * @param string $needle   Text to look for.
 *
 * @return int[] Character offsets of each occurrence, in ascending order.
 *               Empty when there is no occurrence or when $needle is empty.
 */
function _strpos_all($haystack, $needle)
{
    $positions = [];
    $length = mb_strlen($needle); // needle length in characters

    // On PHP 8+ an empty needle is found at the current offset, so the search
    // would never advance; treat it as "no occurrences" instead.
    if ($length === 0) {
        return $positions;
    }

    $offset = 0;
    while (($i = mb_stripos($haystack, $needle, $offset)) !== false) {
        $positions[] = $i;       // mb_stripos returns the offset in characters
        $offset = $i + $length;  // resume right after this occurrence
    }

    return $positions;
}
If you need to match a regular expression, use the mb_ereg_search_* functions (mb_ereg_search_init, mb_ereg_search_setpos and mb_ereg_search_pos):
/**
 * Finds every whole-word, case-insensitive occurrence of $needle in $haystack
 * using the mb_ereg_search_* functions.
 *
 * The needle is treated as literal text (regex metacharacters are escaped) and
 * wrapped in \b word boundaries. Character counting relies on the mbstring
 * internal encoding, and matching on the mbstring regex encoding.
 *
 * @param string $haystack Text to search.
 * @param string $needle   Literal text to look for.
 *
 * @return int[] Character offsets of each match, in ascending order.
 *               Empty when nothing matches or when $needle is empty.
 */
function _ereg_match_all($haystack, $needle)
{
    $positions = [];
    $total = strlen($haystack); // haystack length in bytes

    // An empty needle would reduce the pattern to '\b\b', whose zero-length
    // matches never move the search position forward.
    if ((string) $needle === '' || $total === 0) {
        return $positions;
    }

    // mb_ereg patterns have no delimiters, so preg_quote() gets no delimiter argument.
    $pattern = sprintf('\b%s\b', preg_quote($needle));
    if (!mb_ereg_search_init($haystack, $pattern, 'msi')) {
        return $positions;
    }

    $offset = 0;    // byte offset at which the next search starts
    $charCount = 0; // number of characters in $haystack before $offset

    while ($offset < $total) {
        mb_ereg_search_setpos($offset);
        $result = mb_ereg_search_pos(); // array(byte offset, byte length), or false
        if (empty($result)) {
            break;
        }

        list($start, $matchedBytes) = $result;

        // Convert the byte offset to a character offset incrementally, counting
        // only the text between the previous match and this one.
        $charCount += mb_strlen(substr($haystack, $offset, $start - $offset));
        $positions[] = $charCount;

        // Advance by the matched length rather than the needle length: with
        // case-insensitive matching the two can differ in bytes.
        $charCount += mb_strlen(substr($haystack, $start, $matchedBytes));
        $offset = $start + $matchedBytes;
    }

    return $positions;
}
Or use preg_match with the 'u' modifier:
/**
 * Finds every whole-word, case-insensitive occurrence of $needle in $haystack
 * by calling preg_match() repeatedly with the 'u' (UTF-8) modifier.
 *
 * The needle is treated as literal text (regex metacharacters are escaped) and
 * wrapped in \b word boundaries.
 *
 * @param string $haystack UTF-8 text to search.
 * @param string $needle   Literal text to look for.
 *
 * @return int[] Character offsets of each match, in ascending order.
 *               Empty when nothing matches or when $needle is empty.
 */
function _preg_match_all($haystack, $needle)
{
    $positions = [];

    // An empty needle would reduce the pattern to '/\b\b/iu', whose zero-length
    // matches never move the search offset forward.
    if ((string) $needle === '') {
        return $positions;
    }

    $pattern = sprintf('/\b%s\b/iu', preg_quote($needle, '/'));

    $offset = 0;    // byte offset at which the next search starts
    $charCount = 0; // number of characters in $haystack before $offset

    while (preg_match($pattern, $haystack, $matches, PREG_OFFSET_CAPTURE, $offset) === 1) {
        // PREG_OFFSET_CAPTURE yields array(matched text, byte offset).
        list($matched, $start) = $matches[0];

        // Convert the byte offset to a character offset incrementally. The 'u'
        // modifier only matches valid UTF-8, so the encoding is passed explicitly.
        $charCount += mb_strlen(substr($haystack, $offset, $start - $offset), 'UTF-8');
        $positions[] = $charCount;

        // Advance by the matched text's byte length, not the needle's: under /iu
        // they can differ, and an offset inside a multibyte character makes
        // preg_match() fail and silently ends the search.
        $charCount += mb_strlen($matched, 'UTF-8');
        $offset = $start + strlen($matched);
    }

    return $positions;
}
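Or use preg_match_all with the 'u' modifier (this is an alternative to the previous function and has the same name, so define only one of the two):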
/**
 * Collects the character offsets of every whole-word, case-insensitive
 * occurrence of $needle in $haystack with a single preg_match_all() call.
 *
 * The needle is escaped with preg_quote() and wrapped in \b word boundaries.
 * Character counting relies on the current mbstring internal encoding.
 *
 * @param string $haystack Text to search.
 * @param string $needle   Literal text to look for.
 *
 * @return int[] Character offsets of each match, in ascending order.
 */
function _preg_match_all($haystack, $needle)
{
    $regex = sprintf('/\b%s\b/iu', preg_quote($needle, '/'));
    $flags = PREG_OFFSET_CAPTURE | PREG_SET_ORDER;

    // Offsets reported with PREG_OFFSET_CAPTURE are in bytes, even with the 'u' modifier.
    preg_match_all($regex, $haystack, $matches, $flags);

    $positions = [];
    foreach ($matches as $set) {
        $byteOffset = $set[0][1];

        // Count the characters in the text before the match to get a character offset.
        $prefix = mb_strcut($haystack, 0, $byteOffset);
        $positions[] = mb_strlen($prefix);
    }

    return $positions;
}