PHP glob directory UTF-8

Question 1

As a workaround, you can use the following class:

<?php
  /**
   * Static helpers for converting text between Windows-1252 / ISO 8859-1
   * and UTF-8, and for repairing UTF-8 that was encoded more than once.
   *
   * All conversions operate on raw bytes; no mbstring or iconv is required.
   */
  class Encoding {

    /**
     * Windows-1252 byte value (128-159) => UTF-8 byte sequence of the
     * character Windows-1252 assigns to that byte. Byte values with no
     * entry (129, 141, 143, 144, 157) fall back to the plain ISO 8859-1
     * conversion in toUTF8().
     */
    protected static $win1252ToUtf8 = array(
      128 => "\xe2\x82\xac",
      130 => "\xe2\x80\x9a",
      131 => "\xc6\x92",
      132 => "\xe2\x80\x9e",
      133 => "\xe2\x80\xa6",
      134 => "\xe2\x80\xa0",
      135 => "\xe2\x80\xa1",
      136 => "\xcb\x86",
      137 => "\xe2\x80\xb0",
      138 => "\xc5\xa0",
      139 => "\xe2\x80\xb9",
      140 => "\xc5\x92",
      142 => "\xc5\xbd",
      145 => "\xe2\x80\x98",
      146 => "\xe2\x80\x99",
      147 => "\xe2\x80\x9c",
      148 => "\xe2\x80\x9d",
      149 => "\xe2\x80\xa2",
      150 => "\xe2\x80\x93",
      151 => "\xe2\x80\x94",
      152 => "\xcb\x9c",
      153 => "\xe2\x84\xa2",
      154 => "\xc5\xa1",
      155 => "\xe2\x80\xba",
      156 => "\xc5\x93",
      158 => "\xc5\xbe",
      159 => "\xc5\xb8"
    );

    /**
     * UTF-8 encoding of code points U+0080-U+009F (what you get when
     * Windows-1252 text is converted as if it were ISO 8859-1) => UTF-8
     * encoding of the character Windows-1252 actually meant.
     */
    protected static $brokenUtf8ToUtf8 = array(
      "\xc2\x80" => "\xe2\x82\xac",
      "\xc2\x82" => "\xe2\x80\x9a",
      "\xc2\x83" => "\xc6\x92",
      "\xc2\x84" => "\xe2\x80\x9e",
      "\xc2\x85" => "\xe2\x80\xa6",
      "\xc2\x86" => "\xe2\x80\xa0",
      "\xc2\x87" => "\xe2\x80\xa1",
      "\xc2\x88" => "\xcb\x86",
      "\xc2\x89" => "\xe2\x80\xb0",
      "\xc2\x8a" => "\xc5\xa0",
      "\xc2\x8b" => "\xe2\x80\xb9",
      "\xc2\x8c" => "\xc5\x92",
      "\xc2\x8e" => "\xc5\xbd",
      "\xc2\x91" => "\xe2\x80\x98",
      "\xc2\x92" => "\xe2\x80\x99",
      "\xc2\x93" => "\xe2\x80\x9c",
      "\xc2\x94" => "\xe2\x80\x9d",
      "\xc2\x95" => "\xe2\x80\xa2",
      "\xc2\x96" => "\xe2\x80\x93",
      "\xc2\x97" => "\xe2\x80\x94",
      "\xc2\x98" => "\xcb\x9c",
      "\xc2\x99" => "\xe2\x84\xa2",
      "\xc2\x9a" => "\xc5\xa1",
      "\xc2\x9b" => "\xe2\x80\xba",
      "\xc2\x9c" => "\xc5\x93",
      "\xc2\x9e" => "\xc5\xbe",
      "\xc2\x9f" => "\xc5\xb8"
    );

    /**
     * UTF-8 byte sequence => single Windows-1252 byte (0x80-0x9F).
     * Inverse of $win1252ToUtf8.
     */
    protected static $utf8ToWin1252 = array(
      "\xe2\x82\xac" => "\x80",
      "\xe2\x80\x9a" => "\x82",
      "\xc6\x92"     => "\x83",
      "\xe2\x80\x9e" => "\x84",
      "\xe2\x80\xa6" => "\x85",
      "\xe2\x80\xa0" => "\x86",
      "\xe2\x80\xa1" => "\x87",
      "\xcb\x86"     => "\x88",
      "\xe2\x80\xb0" => "\x89",
      "\xc5\xa0"     => "\x8a",
      "\xe2\x80\xb9" => "\x8b",
      "\xc5\x92"     => "\x8c",
      "\xc5\xbd"     => "\x8e",
      "\xe2\x80\x98" => "\x91",
      "\xe2\x80\x99" => "\x92",
      "\xe2\x80\x9c" => "\x93",
      "\xe2\x80\x9d" => "\x94",
      "\xe2\x80\xa2" => "\x95",
      "\xe2\x80\x93" => "\x96",
      "\xe2\x80\x94" => "\x97",
      "\xcb\x9c"     => "\x98",
      "\xe2\x84\xa2" => "\x99",
      "\xc5\xa1"     => "\x9a",
      "\xe2\x80\xba" => "\x9b",
      "\xc5\x93"     => "\x9c",
      "\xc5\xbe"     => "\x9e",
      "\xc5\xb8"     => "\x9f"
    );

    /**
     * Function Encoding::toUTF8
     *
     * This function leaves UTF-8 characters alone, while converting
     * almost all non-UTF8 to UTF8.
     *
     * It assumes that the encoding of the original string is
     * either Windows-1252 or ISO 8859-1.
     *
     * It may fail to convert characters to UTF-8 if they fall
     * into one of these scenarios:
     *
     * 1) when any of these characters:   ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
     *    are followed by any of these:  ("group B")
     *                                    ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿
     *
     *    For example:   %ABREPRESENT%C9%BB. «REPRESENTÉ»
     *    The "«" (%AB) character will be converted, but the "É"
     *    followed by "»" (%C9%BB) is also a valid unicode
     *    character, and will be left unchanged.
     *
     * 2) when any of these: àáâãäåæçèéêëìíîï  are followed by TWO
     *    characters from group B,
     *
     * 3) when any of these: ðñòó  are followed by THREE
     *    characters from group B.
     *
     * Arrays are converted element by element (recursively, keys are
     * kept); values that are neither arrays nor strings are returned
     * unchanged.
     *
     * @name toUTF8
     * @param mixed $text  Any string, or an array of strings.
     * @return mixed  The same value, with strings UTF-8 encoded
     */
    public static function toUTF8($text){
      if(is_array($text)) {
        foreach($text as $k => $v) {
          $text[$k] = self::toUTF8($v);
        }
        return $text;
      }
      if(!is_string($text)) {
        return $text;
      }

      $max = strlen($text);
      $buf = "";
      for($i = 0; $i < $max; $i++){
        $byte = ord($text[$i]);

        if($byte < 0x80) { // Plain ASCII: it doesn't need conversion
          $buf .= $text[$i];
          continue;
        }

        if($byte < 0xc0) { // 0x80-0xBF outside a UTF-8 sequence: needs conversion
          $buf .= isset(self::$win1252ToUtf8[$byte])
            ? self::$win1252ToUtf8[$byte]          // Windows-1252 special case
            : self::latin1ByteToUtf8($byte);
          continue;
        }

        // 0xC0 and above: either the lead byte of a UTF-8 sequence or a
        // single ISO 8859-1 character. Work out how many continuation
        // bytes a UTF-8 sequence starting with this byte would need.
        if($byte <= 0xdf) {
          $need = 1;       // Looks like 2 bytes UTF-8
        } elseif($byte <= 0xef) {
          $need = 2;       // Looks like 3 bytes UTF-8
        } elseif($byte <= 0xf7) {
          $need = 3;       // Looks like 4 bytes UTF-8
        } else {
          $need = 0;       // 0xF8-0xFF can never start a UTF-8 sequence
        }

        if($need > 0 && self::hasContinuationBytes($text, $i + 1, $need, $max)) {
          // Almost sure it's UTF-8 already: copy the whole sequence and
          // skip past every byte that belongs to it.
          $buf .= substr($text, $i, $need + 1);
          $i += $need;
        } else {
          // Not valid UTF-8. Convert the single byte.
          $buf .= self::latin1ByteToUtf8($byte);
        }
      }
      return $buf;
    }

    /**
     * Tells whether $count bytes starting at offset $start all have the
     * UTF-8 continuation form 10xxxxxx (0x80-0xBF).
     *
     * @param string $text   Byte string being scanned.
     * @param int    $start  Offset of the first byte to check.
     * @param int    $count  Number of bytes to check.
     * @param int    $max    strlen($text).
     * @return bool  False when the string ends before $count bytes are available.
     */
    protected static function hasContinuationBytes($text, $start, $count, $max) {
      if($start + $count > $max) {
        return false;
      }
      for($j = $start; $j < $start + $count; $j++) {
        if((ord($text[$j]) & 0xc0) !== 0x80) {
          return false;
        }
      }
      return true;
    }

    /**
     * Encodes one byte value in the range 0x80-0xFF, read as an
     * ISO 8859-1 code point, as its two-byte UTF-8 sequence.
     *
     * @param int $byte  Byte value, 0x80-0xFF.
     * @return string  Two bytes: 110000xx 10xxxxxx.
     */
    protected static function latin1ByteToUtf8($byte) {
      return chr(0xc0 | ($byte >> 6)) . chr(0x80 | ($byte & 0x3f));
    }

    /**
     * Converts text to Windows-1252.
     *
     * The input is first normalised with toUTF8(), then the characters
     * listed in $utf8ToWin1252 are replaced by their single-byte form and
     * the rest is passed through utf8_decode().
     *
     * Note: utf8_decode() is deprecated as of PHP 8.2.
     *
     * @param mixed $text  A string, or an array of strings (converted recursively).
     * @return mixed  Converted value; non-string, non-array input is returned unchanged.
     */
    public static function toWin1252($text) {
      if(is_array($text)) {
        foreach($text as $k => $v) {
          $text[$k] = self::toWin1252($v);
        }
        return $text;
      }
      if(!is_string($text)) {
        return $text;
      }
      return utf8_decode(str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), self::toUTF8($text)));
    }

    /**
     * Alias of toWin1252().
     *
     * @param mixed $text  See toWin1252().
     * @return mixed  See toWin1252().
     */
    public static function toISO8859($text) {
      return self::toWin1252($text);
    }

    /**
     * Alias of toWin1252().
     *
     * @param mixed $text  See toWin1252().
     * @return mixed  See toWin1252().
     */
    public static function toLatin1($text) {
      return self::toWin1252($text);
    }

    /**
     * Repairs text that was UTF-8 encoded more than once by repeatedly
     * decoding it one level and re-normalising it with toUTF8() until
     * the result no longer changes.
     *
     * Because every pass goes through utf8_decode(), only text whose
     * characters fit Windows-1252 / ISO 8859-1 can be repaired.
     *
     * Note: utf8_decode() is deprecated as of PHP 8.2.
     *
     * @param mixed $text  A string, or an array of strings (fixed recursively).
     * @return mixed  Repaired value; non-string, non-array input is returned unchanged.
     */
    public static function fixUTF8($text){
      if(is_array($text)) {
        foreach($text as $k => $v) {
          $text[$k] = self::fixUTF8($v);
        }
        return $text;
      }
      if(!is_string($text)) { // same contract as toUTF8() / toWin1252()
        return $text;
      }

      $search  = array_keys(self::$utf8ToWin1252);
      $replace = array_values(self::$utf8ToWin1252);

      // Iterate to a fixed point: once a pass leaves the text untouched,
      // any further pass would too, so no extra pass is needed afterwards.
      do {
        $last = $text;
        $text = self::toUTF8(utf8_decode(str_replace($search, $replace, $text)));
      } while($last !== $text);

      return $text;
    }

    /**
     * If you received an UTF-8 string that was converted
     * from Windows-1252 as it was ISO8859-1
     * (ignoring Windows-1252 chars from 80 to 9F) use
     * this function to fix it.
     * See: http://en.wikipedia.org/wiki/Windows-1252
     *
     * @param string $text  UTF-8 text.
     * @return string  Text with every key of $brokenUtf8ToUtf8 replaced by its value.
     */
    public static function UTF8FixWin1252Chars($text){
      return str_replace(array_keys(self::$brokenUtf8ToUtf8), array_values(self::$brokenUtf8ToUtf8), $text);
    }

    /**
     * Strips a leading UTF-8 byte order mark (EF BB BF), if present.
     *
     * @param string $str  Input text.
     * @return string  $str without the BOM; returned unchanged when it has none.
     */
    public static function removeBOM($str=""){
      if(substr($str, 0, 3) === "\xef\xbb\xbf") {
        return substr($str, 3);
      }
      return $str;
    }
  }
?>

To use it, include the script that contains this class and call it like this:

Encoding::toUtf8('Bankdrücken');

Question 2

I finally got it working...

Before I call glob(), I convert the string to ISO 8859-1. After the glob() call I have to encode the results back to UTF-8 so that the callback (the JSON method) can work with the data.

Alexey Palamar's solution mostly worked, but because the string is already UTF-8, I had to convert it to ISO 8859-1 rather than to UTF-8. It worked like this:

$string = $encoding->toISO8859($string);

or

$string = iconv("UTF-8", "ISO-8859-1", $string);

Then do the glob()

$files = glob($path);

Reconvert:

// Convert every path returned by glob() from ISO 8859-1 back to UTF-8.
// glob() returns false on error, so only iterate when it produced an array.
// Iterating the array itself also removes the need for a separate $n_files
// counter, which this snippet never defines.
// Note: utf8_encode() is deprecated as of PHP 8.2.
if (is_array($files)) {
    foreach ($files as $i => $file) {
        $files[$i] = utf8_encode($file);
    }
}