%PDF- %PDF-
Direktori : /home/graphicd/public_html/vebto/vendor/teamtnt/tntsearch/src/Stemmer/ |
Current File : /home/graphicd/public_html/vebto/vendor/teamtnt/tntsearch/src/Stemmer/PortugueseStemmer.php |
<?php

namespace TeamTNT\TNTSearch\Stemmer;

/**
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * This is a reimplementation of the Porter Stemmer Algorithm for Portuguese.
 * This script is based on the implementation found on <https://github.com/wamania/php-stemmer>
 * and has been rewritten to work with TNTSearch by Lucas Padilha <https://github.com/LucasPadilha>
 *
 * Takes a word and reduces it to its Portuguese stem using the Porter stemmer algorithm.
 *
 * References:
 *  - http://snowball.tartarus.org/algorithms/porter/stemmer.html
 *  - http://snowball.tartarus.org/algorithms/portuguese/stemmer.html
 *
 * Usage:
 *  $stem = PortugueseStemmer::stem($word);
 *
 * All positions handled by this class are character (not byte) offsets.
 *
 * @author Lucas Padilha <https://github.com/LucasPadilha>
 */
class PortugueseStemmer implements Stemmer
{
    /**
     * UTF-8 Case lookup table
     *
     * This lookup table maps lower case code points to their corresponding
     * upper case code points. It is only used (flipped) by the strtolower()
     * fallback when the mbstring extension is not available.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    private static $utf8_lower_to_upper = array(
        0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5,
        0x0062=>0x0042, 0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141,
        0x03CD=>0x038E, 0x0101=>0x0100, 0x0491=>0x0490, 0x03B4=>0x0394,
        0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393, 0x00F4=>0x00D4,
        0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
        0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E,
        0x044F=>0x042F, 0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049,
        0x0073=>0x0053, 0x1E1F=>0x1E1E, 0x0135=>0x0134, 0x0447=>0x0427,
        0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3, 0x0440=>0x0420,
        0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
        0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6,
        0x00F9=>0x00D9, 0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4,
        0x0443=>0x0423, 0x015D=>0x015C, 0x0453=>0x0403, 0x03C8=>0x03A8,
        0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4, 0x03AC=>0x0386,
        0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
        0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE,
        0x0157=>0x0156, 0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82,
        0x00E2=>0x00C2, 0x0119=>0x0118, 0x0146=>0x0145, 0x0070=>0x0050,
        0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128, 0x03C7=>0x03A7,
        0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
        0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5,
        0x0075=>0x0055, 0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56,
        0x03C3=>0x03A3, 0x043A=>0x041A, 0x006D=>0x004D, 0x016B=>0x016A,
        0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC, 0x0169=>0x0168,
        0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
        0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3,
        0x044D=>0x042D, 0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C,
        0x011D=>0x011C, 0x00F0=>0x00D0, 0x013C=>0x013B, 0x045F=>0x040F,
        0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5, 0x0066=>0x0046,
        0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
        0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF,
        0x0065=>0x0045, 0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C,
        0x045C=>0x040C, 0x043F=>0x041F, 0x044C=>0x042C, 0x00FE=>0x00DE,
        0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048, 0x00EB=>0x00CB,
        0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
        0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391,
        0x0457=>0x0407, 0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F,
        0x043B=>0x041B, 0x03B5=>0x0395, 0x0445=>0x0425, 0x0121=>0x0120,
        0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396, 0x03B2=>0x0392,
        0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
        0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104,
        0x0458=>0x0408, 0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059,
        0x010B=>0x010A, 0x03CE=>0x038F, 0x0072=>0x0052, 0x0430=>0x0410,
        0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126, 0x0137=>0x0136,
        0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
        0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB,
        0x011F=>0x011E, 0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D,
        0x0107=>0x0106, 0x03CB=>0x03AB, 0x0446=>0x0426, 0x00FE=>0x00DE,
        0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421, 0x0432=>0x0412,
        0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
        0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406,
        0x0103=>0x0102, 0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D,
        0x03CC=>0x038C, 0x00E9=>0x00C9, 0x00F0=>0x00D0, 0x0457=>0x0407,
        0x0123=>0x0122,
    );

    /**
     * Letters treated as vowels by the region (RV / R1 / R2) computations.
     */
    private static $vowels = array('a', 'e', 'i', 'o', 'u', 'á', 'é', 'í', 'ó', 'ú', 'â', 'ê', 'ô');

    /**
     * Reduce a word to its Portuguese stem.
     *
     * The word is lower-cased, the nasal vowels ã/õ are temporarily rewritten
     * as "a~"/"o~", the RV, R1 and R2 regions are located, steps 1-5 are
     * applied and finally the nasal vowels are restored.
     *
     * @param string $word UTF-8 encoded word
     *
     * @return string the stem
     *
     * @throws \Exception when $word is not valid UTF-8 according to check()
     */
    public static function stem($word)
    {
        // we do ALL in UTF-8
        if (!self::check($word)) {
            throw new \Exception('Word must be in UTF-8');
        }

        $word = self::strtolower($word);
        $word = self::str_replace(array('ã', 'õ'), array('a~', 'o~'), $word);

        // rv() always assigns both outputs (RV defaults to the end of the word).
        $rv      = '';
        $rvIndex = 0;
        self::rv($word, $rv, $rvIndex);

        $r1      = '';
        $r1Index = 0;
        self::r1($word, $r1, $r1Index);

        $r2      = '';
        $r2Index = 0;
        self::r2($r1, $r1Index, $r2, $r2Index);

        $initialWord = $word;

        // Always do step 1; do step 2 only if step 1 left the word untouched.
        self::step1($word, $r1Index, $r2Index, $rvIndex);
        if ($initialWord === $word) {
            self::step2($word, $rvIndex);
        }

        // Step 3 if step 1 or 2 altered the word, step 4 otherwise.
        if ($initialWord !== $word) {
            self::step3($word, $rvIndex);
        } else {
            self::step4($word, $rvIndex);
        }

        self::step5($word, $rvIndex);
        self::finish($word);

        return $word;
    }

    /**
     * R1 is the region after the first non-vowel following a vowel, or the end of the word
     * if there is no such non-vowel.
     *
     * @param string $word     word to analyse
     * @param string $r1       receives the R1 text
     * @param int    $r1Index  receives the character offset at which R1 starts
     *
     * @return bool always true
     */
    private static function r1($word, &$r1, &$r1Index)
    {
        list($index, $value) = self::rx($word);

        $r1      = $value;
        $r1Index = $index;

        return true;
    }

    /**
     * R2 is the region after the first non-vowel following a vowel in R1, or the end of the word
     * if there is no such non-vowel.
     *
     * @param string $r1       text of R1
     * @param int    $r1Index  offset of R1 inside the word
     * @param string $r2       receives the R2 text
     * @param int    $r2Index  receives the offset of R2 relative to the whole word
     *
     * @return bool always true
     */
    private static function r2($r1, $r1Index, &$r2, &$r2Index)
    {
        list($index, $value) = self::rx($r1);

        $r2      = $value;
        $r2Index = $r1Index + $index;

        return true;
    }

    /**
     * Common function for R1 and R2.
     *
     * Searches the region after the first non-vowel following a vowel in $in.
     * R1 : $in = the word
     * R2 : $in = R1
     *
     * When the first qualifying vowel is the last character, the returned index is one past
     * the end of $in; search() treats any offset beyond the word length as "not found".
     *
     * @param string $in
     *
     * @return array array($index, $value): start offset of the region and its text
     */
    private static function rx($in)
    {
        $length = self::strlen($in);

        // Defaults: empty region starting at the end of the input
        $index = $length;
        $value = '';

        for ($i = 0; $i < $length; $i++) {
            if (!self::isVowel(self::substr($in, $i, 1))) {
                continue;
            }

            // A vowel followed by a non-vowel (or by nothing) marks the boundary.
            if (!self::isVowel(self::substr($in, $i + 1, 1))) {
                $index = $i + 2;
                $value = self::substr($in, $i + 2);
                break;
            }
        }

        return array($index, $value);
    }

    /**
     * Used by spanish, italian, portuguese, etc (but not by french)
     *
     * If the second letter is a consonant, RV is the region after the next following vowel,
     * or if the first two letters are vowels, RV is the region after the next consonant,
     * and otherwise (consonant-vowel case) RV is the region after the third letter.
     * But RV is the end of the word if these positions cannot be found.
     *
     * @param string $word     word to analyse
     * @param string $rv       receives the RV text ('' when RV is the end of the word)
     * @param int    $rvIndex  receives the offset at which RV starts (word length when not found)
     *
     * @return bool true when a position was found or the word is shorter than 3 characters
     */
    private static function rv($word, &$rv, &$rvIndex)
    {
        $length = self::strlen($word);

        // Default: RV is the end of the word. Keeping $rvIndex an integer matters because
        // it is later used as the offset of mb_strrpos().
        $rv      = '';
        $rvIndex = $length;

        if ($length < 3) {
            return true;
        }

        $firstIsVowel  = self::isVowel(self::substr($word, 0, 1));
        $secondIsVowel = self::isVowel(self::substr($word, 1, 1));

        // If the second letter is a consonant, RV is the region after the next following vowel,
        if (!$secondIsVowel) {
            for ($i = 2; $i < $length; $i++) {
                if (self::isVowel(self::substr($word, $i, 1))) {
                    $rv      = self::substr($word, $i + 1);
                    $rvIndex = $i + 1;

                    return true;
                }
            }
        }

        // or if the first two letters are vowels, RV is the region after the next consonant,
        if ($firstIsVowel && $secondIsVowel) {
            for ($i = 2; $i < $length; $i++) {
                if (!self::isVowel(self::substr($word, $i, 1))) {
                    $rv      = self::substr($word, $i + 1);
                    $rvIndex = $i + 1;

                    return true;
                }
            }
        }

        // and otherwise (consonant-vowel case) RV is the region after the third letter.
        if (!$firstIsVowel && $secondIsVowel) {
            $rv      = self::substr($word, 3);
            $rvIndex = 3;

            return true;
        }

        return false;
    }

    /**
     * Tell whether a single character is one of the configured vowels.
     *
     * @param string $letter one character, or '' past the end of a word
     *
     * @return bool
     */
    private static function isVowel($letter)
    {
        return in_array($letter, self::$vowels, true);
    }

    /**
     * @param int $position
     * @param int $rvIndex
     *
     * @return bool true when $position lies inside RV
     */
    private static function inRv($position, $rvIndex)
    {
        return ($position >= $rvIndex);
    }

    /**
     * @param int $position
     * @param int $r1Index
     *
     * @return bool true when $position lies inside R1
     */
    private static function inR1($position, $r1Index)
    {
        return ($position >= $r1Index);
    }

    /**
     * @param int $position
     * @param int $r2Index
     *
     * @return bool true when $position lies inside R2
     */
    private static function inR2($position, $r2Index)
    {
        return ($position >= $r2Index);
    }

    /**
     * Find a word-final suffix that lies entirely inside RV.
     *
     * @return int|false
     */
    private static function searchIfInRv($word, $suffixes, $rvIndex)
    {
        return self::search($word, $suffixes, $rvIndex);
    }

    /**
     * Find a word-final suffix that lies entirely inside R2.
     *
     * @return int|false
     */
    private static function searchIfInR2($word, $suffixes, $r2Index)
    {
        return self::search($word, $suffixes, $r2Index);
    }

    /**
     * Return the position of the first suffix of the list that ends the word and starts
     * at or after $offset.
     *
     * Suffixes are tried in the given order, so callers list them longest first.
     *
     * @param string   $word
     * @param string[] $suffixes
     * @param int      $offset   minimum start position of the suffix
     *
     * @return int|false position of the suffix, or false when none matches
     */
    private static function search($word, $suffixes, $offset = 0)
    {
        $length = self::strlen($word);

        // A region starting past the end of the word cannot contain anything.
        if ($offset > $length) {
            return false;
        }

        foreach ($suffixes as $suffix) {
            $position = self::strrpos($word, $suffix, $offset);

            // The last occurrence must reach exactly to the end of the word.
            if ($position !== false && (self::strlen($suffix) + $position) == $length) {
                return $position;
            }
        }

        return false;
    }

    /**
     * Step 1: Standard suffix removal
     *
     * NOTE(review): the 'logía'/'logías' and 'ución'/'uciones' groups look like Spanish
     * forms; confirm against the current Snowball Portuguese definition before changing
     * them, since any change alters the stems stored in existing indexes.
     *
     * @param string $word     word, modified in place
     * @param int    $r1Index
     * @param int    $r2Index
     * @param int    $rvIndex
     *
     * @return bool true when one of the suffix groups matched (even if nothing was removed)
     */
    private static function step1(&$word, $r1Index, $r2Index, $rvIndex)
    {
        // delete if in R2
        if (($position = self::search($word, array('amentos', 'imentos', 'adoras', 'adores', 'amento', 'imento', 'adora', 'istas', 'ismos', 'antes', 'ância', 'ezas', 'eza', 'icos', 'icas', 'ismo', 'ável', 'ível', 'ista', 'oso', 'osos', 'osas', 'osa', 'ico', 'ica', 'ador', 'aça~o', 'aço~es' , 'ante'))) !== false) {
            if (self::inR2($position, $r2Index)) {
                $word = self::substr($word, 0, $position);
            }

            return true;
        }

        // replace with log if in R2
        if (($position = self::search($word, array('logías', 'logía'))) !== false) {
            if (self::inR2($position, $r2Index)) {
                $word = preg_replace('#(logías|logía)$#u', 'log', $word);
            }

            return true;
        }

        // replace with u if in R2
        if (($position = self::search($word, array('uciones', 'ución'))) !== false) {
            if (self::inR2($position, $r2Index)) {
                $word = preg_replace('#(uciones|ución)$#u', 'u', $word);
            }

            return true;
        }

        // replace with ente if in R2
        if (($position = self::search($word, array('ências', 'ência'))) !== false) {
            if (self::inR2($position, $r2Index)) {
                $word = preg_replace('#(ências|ência)$#u', 'ente', $word);
            }

            return true;
        }

        // delete if in R1
        // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
        // if preceded by os, ic or ad, delete if in R2
        if (($position = self::search($word, array('amente'))) !== false) {
            // delete if in R1
            if (self::inR1($position, $r1Index)) {
                $word = self::substr($word, 0, $position);
            }

            // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
            if (($position2 = self::searchIfInR2($word, array('iv'), $r2Index)) !== false) {
                $word = self::substr($word, 0, $position2);

                if (($position3 = self::searchIfInR2($word, array('at'), $r2Index)) !== false) {
                    $word = self::substr($word, 0, $position3);
                }

            // if preceded by os, ic or ad, delete if in R2
            } elseif (($position4 = self::searchIfInR2($word, array('os', 'ic', 'ad'), $r2Index)) !== false) {
                $word = self::substr($word, 0, $position4);
            }

            return true;
        }

        // delete if in R2
        // if preceded by ante, avel or ível, delete if in R2
        if (($position = self::search($word, array('mente'))) !== false) {
            // delete if in R2
            if (self::inR2($position, $r2Index)) {
                $word = self::substr($word, 0, $position);
            }

            // if preceded by ante, avel or ível, delete if in R2
            // (strict comparison, consistent with every other branch)
            if (($position2 = self::searchIfInR2($word, array('ante', 'avel', 'ível'), $r2Index)) !== false) {
                $word = self::substr($word, 0, $position2);
            }

            return true;
        }

        // delete if in R2
        // if preceded by abil, ic or iv, delete if in R2
        if (($position = self::search($word, array('idades', 'idade'))) !== false) {
            // delete if in R2
            if (self::inR2($position, $r2Index)) {
                $word = self::substr($word, 0, $position);
            }

            // if preceded by abil, ic or iv, delete if in R2
            if (($position2 = self::searchIfInR2($word, array('abil', 'ic', 'iv'), $r2Index)) !== false) {
                $word = self::substr($word, 0, $position2);
            }

            return true;
        }

        // delete if in R2
        // if preceded by at, delete if in R2
        if (($position = self::search($word, array('ivas', 'ivos', 'iva', 'ivo'))) !== false) {
            // delete if in R2
            if (self::inR2($position, $r2Index)) {
                $word = self::substr($word, 0, $position);
            }

            // if preceded by at, delete if in R2
            if (($position2 = self::searchIfInR2($word, array('at'), $r2Index)) !== false) {
                $word = self::substr($word, 0, $position2);
            }

            return true;
        }

        // replace with ir if in RV and preceded by e
        if (($position = self::search($word, array('iras', 'ira'))) !== false) {
            if (self::inRv($position, $rvIndex)) {
                $letter = self::substr($word, $position - 1, 1);

                if ($letter === 'e') {
                    $word = preg_replace('#(iras|ira)$#u', 'ir', $word);
                }
            }

            return true;
        }

        return false;
    }

    /**
     * Step 2: Verb suffixes
     * Search for the longest among the following suffixes in RV, and if found, delete.
     *
     * @param string $word     word, modified in place
     * @param int    $rvIndex
     *
     * @return bool true when a suffix was removed
     */
    private static function step2(&$word, $rvIndex)
    {
        $suffixes = array(
            'aríamos', 'eríamos', 'iríamos', 'ássemos', 'êssemos', 'íssemos', 'aríeis', 'eríeis', 'iríeis', 'ásseis', 'ésseis', 'ísseis',
            'áramos', 'éramos', 'íramos', 'ávamos', 'aremos', 'eremos', 'iremos', 'ariam', 'eriam', 'iriam', 'assem', 'essem',
            'issem', 'arias', 'erias', 'irias', 'ardes', 'erdes', 'irdes', 'asses', 'esses', 'isses', 'astes', 'estes',
            'istes', 'áreis', 'areis', 'éreis', 'ereis', 'íreis', 'ireis', 'áveis', 'íamos', 'armos', 'ermos', 'irmos',
            'aria', 'eria', 'iria', 'asse', 'esse', 'isse', 'aste', 'este', 'iste', 'arei', 'erei', 'irei',
            'adas', 'idas', 'aram', 'eram', 'iram', 'avam', 'arem', 'erem', 'irem', 'ando', 'endo', 'indo',
            'ara~o', 'era~o', 'ira~o', 'arás', 'aras', 'erás', 'eras', 'irás', 'avas', 'ares', 'eres', 'ires',
            'íeis', 'ados', 'idos', 'ámos', 'amos', 'emos', 'imos', 'iras', 'ada', 'ida', 'ará', 'ara',
            'erá', 'era', 'irá', 'ava', 'iam', 'ado', 'ido', 'ias', 'ais', 'eis', 'ira', 'ia',
            'ei', 'am', 'em', 'ar', 'er', 'ir', 'as', 'es', 'is', 'eu', 'iu', 'ou',
        );

        if (($position = self::searchIfInRv($word, $suffixes, $rvIndex)) !== false) {
            $word = self::substr($word, 0, $position);

            return true;
        }

        return false;
    }

    /**
     * Step 3: d-suffixes
     *
     * @param string $word     word, modified in place
     * @param int    $rvIndex
     *
     * @return bool true when the word ends with "i" inside RV
     */
    private static function step3(&$word, $rvIndex)
    {
        // Delete suffix i if in RV and preceded by c
        if (self::searchIfInRv($word, array('i'), $rvIndex) !== false) {
            $letter = self::substr($word, -2, 1);

            if ($letter === 'c') {
                $word = self::substr($word, 0, -1);
            }

            return true;
        }

        return false;
    }

    /**
     * Step 4: residual suffix
     *
     * @param string $word     word, modified in place
     * @param int    $rvIndex
     *
     * @return bool true when a suffix was removed
     */
    private static function step4(&$word, $rvIndex)
    {
        // If the word ends with one of the suffixes "os a i o á í ó" in RV, delete it
        if (($position = self::searchIfInRv($word, array('os', 'a', 'i', 'o','á', 'í', 'ó'), $rvIndex)) !== false) {
            $word = self::substr($word, 0, $position);

            return true;
        }

        return false;
    }

    /**
     * Step 5: final "e" and cedilla
     *
     * @param string $word     word, modified in place
     * @param int    $rvIndex
     *
     * @return bool true when the word was handled by one of the two rules
     */
    private static function step5(&$word, $rvIndex)
    {
        // If the word ends with one of "e é ê" in RV, delete it, and if preceded by gu (or ci)
        // with the u (or i) in RV, delete the u (or i).
        if (self::searchIfInRv($word, array('e', 'é', 'ê'), $rvIndex) !== false) {
            $word = self::substr($word, 0, -1);

            if (($position2 = self::search($word, array('gu', 'ci'))) !== false) {
                // $position2 + 1 is the offset of the trailing u / i
                if (self::inRv(($position2 + 1), $rvIndex)) {
                    $word = self::substr($word, 0, -1);
                }
            }

            return true;
        } elseif (self::search($word, array('ç')) !== false) {
            // Otherwise a word-final ç loses its cedilla
            $word = preg_replace('#(ç)$#u', 'c', $word);

            return true;
        }

        return false;
    }

    /**
     * Undo the substitution made at the start of stem(): turn "a~" and "o~" back into ã and õ.
     *
     * @param string $word word, modified in place
     */
    private static function finish(&$word)
    {
        $word = self::str_replace(array('a~', 'o~'), array('ã', 'õ'), $word);
    }

    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * Walks the bytes and verifies that every lead byte is followed by the number of
     * continuation bytes (10bbbbbb) that its bit pattern announces.
     *
     * @author <bmorel@ssi.fr>
     * @link http://www.php.net/manual/en/function.utf8-encode.php
     *
     * @param string $str
     *
     * @return bool
     */
    private static function check($str)
    {
        $byteCount = strlen($str);

        for ($i = 0; $i < $byteCount; $i++) {
            $byte = ord($str[$i]);

            if ($byte < 0x80) {
                continue; # 0bbbbbbb
            } elseif (($byte & 0xE0) == 0xC0) {
                $n = 1; # 110bbbbb
            } elseif (($byte & 0xF0) == 0xE0) {
                $n = 2; # 1110bbbb
            } elseif (($byte & 0xF8) == 0xF0) {
                $n = 3; # 11110bbb
            } elseif (($byte & 0xFC) == 0xF8) {
                $n = 4; # 111110bb
            } elseif (($byte & 0xFE) == 0xFC) {
                $n = 5; # 1111110b
            } else {
                return false; # Does not match any model
            }

            for ($j = 0; $j < $n; $j++) { # n bytes matching 10bbbbbb follow ?
                if ((++$i == $byteCount) || ((ord($str[$i]) & 0xC0) != 0x80)) {
                    return false;
                }
            }
        }

        return true;
    }

    /**
     * Unicode aware replacement for strlen()
     *
     * Uses the mbstring extension if available and falls back to counting PCRE
     * character matches. utf8_decode() is deliberately not used: it is deprecated
     * as of PHP 8.2.
     *
     * @see strlen()
     *
     * @param string $string
     *
     * @return int number of characters
     */
    private static function strlen($string)
    {
        if (!defined('UTF8_NOMBSTRING') && function_exists('mb_strlen')) {
            return mb_strlen($string, 'utf-8');
        }

        // "s" lets the dot match line breaks too, so every character is counted.
        return (int) preg_match_all('/./us', $string, $matches);
    }

    /**
     * Unicode aware replacement for substr()
     *
     * @author lmak at NOSPAM dot iti dot gr
     * @link http://www.php.net/manual/en/function.substr.php
     * @see substr()
     *
     * @param string   $str
     * @param int      $start   character offset (may be negative)
     * @param int|null $length  number of characters (may be negative); null means "to the end"
     *
     * @return string
     */
    private static function substr($str, $start, $length = null)
    {
        $ar = array();
        preg_match_all("/./u", $str, $ar);

        // Strict null check so that an explicit length of 0 yields an empty string.
        if ($length !== null) {
            return join("", array_slice($ar[0], $start, $length));
        }

        return join("", array_slice($ar[0], $start));
    }

    /**
     * Unicode aware replacement for str_replace()
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see str_replace()
     *
     * @param string|string[] $s   search string(s)
     * @param string|string[] $r   replacement(s)
     * @param string          $str subject
     *
     * @return string
     */
    private static function str_replace($s, $r, $str)
    {
        if (!is_array($s)) {
            $s = '!'.preg_quote($s, '!').'!u';
        } else {
            foreach ($s as $k => $v) {
                $s[$k] = '!'.preg_quote($v, '!').'!u';
            }
        }

        return preg_replace($s, $r, $str);
    }

    /**
     * This is a unicode aware replacement for strtolower()
     *
     * Uses mb_string extension if available
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see strtolower()
     * @see utf8_strtoupper()
     *
     * @param string $string
     *
     * @return string
     */
    private static function strtolower($string)
    {
        if (!defined('UTF8_NOMBSTRING') && function_exists('mb_strtolower')) {
            return mb_strtolower($string, 'utf-8');
        }

        $utf8_upper_to_lower = array_flip(self::$utf8_lower_to_upper);
        $uni = self::utf8_to_unicode($string);
        $cnt = count($uni);

        for ($i = 0; $i < $cnt; $i++) {
            // Only code points present in the table have a lower case form;
            // isset() avoids an undefined-index notice for all the others.
            if (isset($utf8_upper_to_lower[$uni[$i]])) {
                $uni[$i] = $utf8_upper_to_lower[$uni[$i]];
            }
        }

        return self::unicode_to_utf8($uni);
    }

    /**
     * This function returns any UTF-8 encoded text as a list of
     * Unicode values.
     *
     * Only one-, two- and three-byte sequences are decoded: every lead byte
     * of 224 or above is handled as the start of a three-byte sequence.
     *
     * @author Scott Michael Reynen <scott@randomchaos.com>
     * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
     * @see    unicode_to_utf8()
     *
     * @param string $str
     *
     * @return int[] code points
     */
    private static function utf8_to_unicode(&$str)
    {
        $unicode     = array();
        $values      = array();
        $looking_for = 1;
        $byteCount   = strlen($str);

        for ($i = 0; $i < $byteCount; $i++) {
            $this_value = ord($str[$i]);

            if ($this_value < 128) {
                $unicode[] = $this_value;
                continue;
            }

            // First byte of a multi-byte sequence decides its length
            if (count($values) == 0) {
                $looking_for = ($this_value < 224) ? 2 : 3;
            }

            $values[] = $this_value;

            if (count($values) == $looking_for) {
                $number = ($looking_for == 3)
                    ? (($values[0] % 16) * 4096) + (($values[1] % 64) * 64) + ($values[2] % 64)
                    : (($values[0] % 32) * 64) + ($values[1] % 64);

                $unicode[]   = $number;
                $values      = array();
                $looking_for = 1;
            }
        }

        return $unicode;
    }

    /**
     * This function converts a Unicode array back to its UTF-8 representation
     *
     * @author Scott Michael Reynen <scott@randomchaos.com>
     * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
     * @see    utf8_to_unicode()
     *
     * @param int[] $str code points
     *
     * @return string '' when $str is not an array
     */
    private static function unicode_to_utf8(&$str)
    {
        if (!is_array($str)) {
            return '';
        }

        $utf8 = '';

        foreach ($str as $unicode) {
            if ($unicode < 128) {
                $utf8 .= chr($unicode);
            } elseif ($unicode < 2048) {
                $utf8 .= chr(192 + (($unicode - ($unicode % 64)) / 64));
                $utf8 .= chr(128 + ($unicode % 64));
            } else {
                $utf8 .= chr(224 + (($unicode - ($unicode % 4096)) / 4096));
                $utf8 .= chr(128 + ((($unicode % 4096) - ($unicode % 64)) / 64));
                $utf8 .= chr(128 + ($unicode % 64));
            }
        }

        return $utf8;
    }

    /**
     * This is an Unicode aware replacement for strrpos
     *
     * Uses mb_string extension if available
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see strrpos()
     *
     * @param string $haystack
     * @param string $needle
     * @param int    $offset   character offset at which the search starts
     *
     * @return int|false character position of the last occurrence, or false
     */
    private static function strrpos($haystack, $needle, $offset = 0)
    {
        if (!defined('UTF8_NOMBSTRING') && function_exists('mb_strrpos')) {
            return mb_strrpos($haystack, $needle, $offset, 'utf-8');
        }

        if (!$offset) {
            $ar    = self::explode($needle, $haystack);
            $count = count($ar);

            if ($count > 1) {
                // Everything after the last separator is the tail following the last match
                return self::strlen($haystack) - self::strlen($ar[($count - 1)]) - self::strlen($needle);
            }

            return false;
        }

        if (!is_int($offset)) {
            trigger_error('Offset must be an integer', E_USER_WARNING);

            return false;
        }

        // Search in the part after the offset, then translate back to haystack positions
        $str = self::substr($haystack, $offset);

        if (false !== ($pos = self::strrpos($str, $needle))) {
            return $pos + $offset;
        }

        return false;
    }

    /**
     * Unicode aware replacement for explode
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see explode();
     *
     * @param string $sep
     * @param string $str
     *
     * @return array|false false (with a warning) when $sep is empty
     */
    private static function explode($sep, $str)
    {
        if ($sep == '') {
            trigger_error('Empty delimiter', E_USER_WARNING);

            return false;
        }

        return preg_split('!'.preg_quote($sep, '!').'!u', $str);
    }
}