Finding all occurrences of a UTF-8-encoded needle in a UTF-8-encoded haystack

·

Firstly, set the encoding used for mb_* functions and mb_* regular expressions (the second might be redundant, but might as well add it to be sure):

mb_internal_encoding('UTF-8');
mb_regex_encoding('UTF-8');

If you're just matching a string within a string, mb_stripos works (the 'i' stands for 'case-insensitive' - use mb_strpos otherwise):

/**
 * Finds every non-overlapping, case-insensitive occurrence of $needle in $haystack.
 *
 * Both strings are treated as UTF-8, independently of the current
 * mb_internal_encoding() setting.
 *
 * @param string $haystack UTF-8 text to search in
 * @param string $needle   UTF-8 text to search for
 * @return int[] offsets of each match counted in characters (not bytes), in
 *               ascending order; empty when nothing matches or $needle is empty
 */
function _strpos_all($haystack, $needle){
  $positions = array();
  $length = mb_strlen($needle, 'UTF-8'); // length in chars

  // On PHP 8+, where an empty needle is accepted, the search would succeed
  // without ever advancing $offset, so the loop below would never terminate.
  if ($length === 0)
    return $positions;

  $offset = 0;
  while (($i = mb_stripos($haystack, $needle, $offset, 'UTF-8')) !== FALSE) {
    $positions[] = $i; // mb_stripos returns the offset in chars
    $offset = $i + $length; // resume right after this match, so matches never overlap
  }

  return $positions;
}

If you need to match a regular expression, use the mb_ereg_search_* functions (mb_ereg_search_init, mb_ereg_search_setpos and mb_ereg_search_pos):

/**
 * Finds every whole-word, case-insensitive occurrence of $needle in $haystack
 * using the mb_ereg_search_* functions.
 *
 * The mb_ereg_* functions take their encoding from mb_regex_encoding() only,
 * so that setting must be 'UTF-8' when this function is called.
 *
 * @param string $haystack UTF-8 text to search in
 * @param string $needle   literal UTF-8 text to search for (quoted before use)
 * @return int[] offsets of each match counted in characters (not bytes), in
 *               ascending order; empty when nothing matches or either
 *               argument is empty
 */
function _ereg_match_all($haystack, $needle){
  $positions = array();
  $total = strlen($haystack); // length in bytes

  // An empty needle yields the zero-width pattern '\b\b', which would match
  // without advancing the offset; an empty haystack has nothing to search.
  if ((string) $needle === '' || $total === 0)
    return $positions;

  $pattern = sprintf('\b%s\b', preg_quote($needle, '/'));

  mb_ereg_search_init($haystack, $pattern, 'msi'); // initialise the search subject and pattern

  $offset = 0;
  while ($offset < $total) {
    mb_ereg_search_setpos($offset); // set the byte offset from which the search will start

    $results = mb_ereg_search_pos(); // array(byte offset, byte length) of the next match, or FALSE
    if (empty($results))
      break;

    $i = $results[0]; // mb_ereg_search_pos returns the offset in bytes

    $sub = mb_strcut($haystack, 0, $i, 'UTF-8'); // mb_strcut cuts at an offset counted in bytes
    $positions[] = mb_strlen($sub, 'UTF-8'); // mb_strlen returns the length of a string in chars

    // Advance by the length that was actually matched: with the 'i' option the
    // matched text need not have the same byte length as $needle. The \b
    // assertions are zero-width and add nothing to that length.
    $offset = $i + $results[1];
  }

  return $positions;
}

Or use preg_match with the 'u' modifier:

/**
 * Finds every whole-word, case-insensitive occurrence of $needle in $haystack
 * by calling preg_match() repeatedly with the 'u' modifier.
 *
 * Offsets are converted with an explicit 'UTF-8' encoding, so the result does
 * not depend on the current mb_internal_encoding() setting.
 *
 * @param string $haystack UTF-8 text to search in
 * @param string $needle   literal UTF-8 text to search for (quoted before use)
 * @return int[] offsets of each match counted in characters (not bytes), in
 *               ascending order; empty when nothing matches or $needle is empty
 */
function _preg_match_all($haystack, $needle){
  $positions = array();

  // An empty needle yields the zero-width pattern '\b\b', which would match
  // without ever advancing the offset.
  if ((string) $needle === '')
    return $positions;

  $pattern = sprintf('/\b%s\b/iu', preg_quote($needle, '/'));

  $offset = 0;
  // preg_match returns 0 when nothing more is found and FALSE on error;
  // either one ends the search.
  while (preg_match($pattern, $haystack, $matches, PREG_OFFSET_CAPTURE, $offset)) {
    list($matched, $i) = $matches[0]; // PREG_OFFSET_CAPTURE gives array(matched text, byte offset)

    $sub = mb_strcut($haystack, 0, $i, 'UTF-8'); // mb_strcut cuts at an offset counted in bytes
    $positions[] = mb_strlen($sub, 'UTF-8'); // mb_strlen returns the length of a string in chars

    // Advance by the byte length of what was actually matched. Under /iu the
    // matched text can be longer than $needle in bytes (e.g. U+212A KELVIN SIGN
    // matches 'k'); advancing by strlen($needle) would then land inside a
    // character and make the next preg_match fail.
    $offset = $i + strlen($matched);
  }

  return $positions;
}

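Or use preg_match_all with the 'u' modifier (this is an alternative to the previous function and reuses its name, so define only one of the two):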

/**
 * Finds every whole-word, case-insensitive occurrence of $needle in $haystack
 * with a single preg_match_all() call using the 'u' modifier.
 *
 * Offsets are converted with an explicit 'UTF-8' encoding, so the result does
 * not depend on the current mb_internal_encoding() setting.
 *
 * @param string $haystack UTF-8 text to search in
 * @param string $needle   literal UTF-8 text to search for (quoted before use)
 * @return int[] offsets of each match counted in characters (not bytes), in
 *               ascending order; empty when nothing matches or matching fails
 */
function _preg_match_all($haystack, $needle){
  $pattern = sprintf('/\b%s\b/iu', preg_quote($needle, '/'));
  $positions = array();

  // preg_match_all returns 0 when nothing matches and FALSE on error.
  // PREG_OFFSET_CAPTURE returns byte offsets, not chars, even with the 'u' modifier.
  if (!preg_match_all($pattern, $haystack, $matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER))
    return $positions;

  // Convert bytes to chars incrementally: only the bytes between the previous
  // match and this one are counted, so the haystack is walked once in total
  // instead of once per match.
  $bytes = 0;
  $chars = 0;
  foreach ($matches as $match) {
    $start = $match[0][1];
    $chars += mb_strlen(substr($haystack, $bytes, $start - $bytes), 'UTF-8');
    $bytes = $start;
    $positions[] = $chars;
  }

  return $positions;
}