%PDF- %PDF-
Direktori : /home/graphicd/public_html/vebto/vendor/teamtnt/tntsearch/src/Stemmer/ |
Current File : /home/graphicd/public_html/vebto/vendor/teamtnt/tntsearch/src/Stemmer/ItalianStemmer.php |
<?php namespace TeamTNT\TNTSearch\Stemmer;

/*
 * The following code, downloaded from <https://www.drupal.org/project/italianstemmer>,
 * was originally written by Roberto Mirizzi (<roberto.mirizzi@gmail.com>,
 * <http://sisinflab.poliba.it/mirizzi/>) in February 2007. It was the PHP5 implementation
 * of Martin Porter's stemming algorithm for Italian language. This algorithm can be found
 * at the address: <http://snowball.tartarus.org/algorithms/italian/stemmer.html>.
 *
 * It was rewritten in March 2017 for TNTSearch by GaspariLab S.r.l., <dev@gasparilab.it>.
 */

/*
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * Stemmer for Italian words, following the step structure of the Snowball
 * Italian algorithm (step 0: attached pronouns, step 1: standard suffixes,
 * step 2: verb suffixes, step 3a/3b: final vowel and 'ch'/'gh', step 4: restore case).
 *
 * Internally the letters 'I' and 'U' (upper case) are used as markers for an
 * 'i'/'u' that must be treated as a consonant; that is why they appear in
 * $consonanti and why the word is lower-cased again in the last step.
 *
 * All state is static: results are memoised in $cache and the suffix tables are
 * ordered longest-first exactly once, on first use.
 */
class ItalianStemmer implements Stemmer
{
    /** @var array Memoised stems, keyed by the lower-cased input word. */
    private static $cache = [];

    /** @var bool True once the suffix tables have been ordered longest-first. */
    private static $suffixesSorted = false;

    /** @var string[] Vowels (grave accents included; acute ones are normalised to grave first). */
    private static $vocali = ['a', 'e', 'i', 'o', 'u', 'à', 'è', 'ì', 'ò', 'ù'];

    /** @var string[] Consonants, plus the 'I'/'U' markers that behave like consonants. */
    private static $consonanti = [
        'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'y', 'z',
        'I', 'U',
    ];

    /** @var string[] Acute-accented vowels, position-aligned with $accenti_gravi. */
    private static $accenti_acuti = ['á', 'é', 'í', 'ó', 'ú'];

    /** @var string[] Grave-accented replacements for $accenti_acuti. */
    private static $accenti_gravi = ['à', 'è', 'ì', 'ò', 'ù'];

    /** @var string[] Step 0: attached pronouns. */
    private static $suffissi_step0 = [
        'ci', 'gli', 'la', 'le', 'li', 'lo', 'mi', 'ne', 'si', 'ti', 'vi', 'sene',
        'gliela', 'gliele', 'glieli', 'glielo', 'gliene',
        'mela', 'mele', 'meli', 'melo', 'mene',
        'tela', 'tele', 'teli', 'telo', 'tene',
        'cela', 'cele', 'celi', 'celo', 'cene',
        'vela', 'vele', 'veli', 'velo', 'vene',
    ];

    /** @var string[] Step 1: suffixes deleted when they lie in R2. */
    private static $suffissi_step1_a = [
        'anza', 'anze', 'ico', 'ici', 'ica', 'ice', 'iche', 'ichi', 'ismo', 'ismi',
        'abile', 'abili', 'ibile', 'ibili', 'ista', 'iste', 'isti', 'istà', 'istè', 'istì',
        'oso', 'osi', 'osa', 'ose', 'mente', 'atrice', 'atrici', 'ante', 'anti',
    ];

    /** @var string[] Step 1: deleted in R2, then a preceding 'ic' is deleted in R2. */
    private static $suffissi_step1_b = ['azione', 'azioni', 'atore', 'atori'];

    /** @var string[] Step 1: replaced with 'log' in R2. */
    private static $suffissi_step1_c = ['logia', 'logie'];

    /** @var string[] Step 1: replaced with 'u' in R2. */
    private static $suffissi_step1_d = ['uzione', 'uzioni', 'usione', 'usioni'];

    /** @var string[] Step 1: replaced with 'ente' in R2. */
    private static $suffissi_step1_e = ['enza', 'enze'];

    /** @var string[] Step 1: deleted in RV. */
    private static $suffissi_step1_f = ['amento', 'amenti', 'imento', 'imenti'];

    /** @var string[] Step 1: deleted in R1, with follow-up deletions in R2. */
    private static $suffissi_step1_g = ['amente'];

    /** @var string[] Step 1: deleted in R2, then a preceding 'abil', 'ic' or 'iv' is deleted in R2. */
    private static $suffissi_step1_h = ['ità'];

    /** @var string[] Step 1: deleted in R2, then a preceding 'at' (and further 'ic') is deleted in R2. */
    private static $suffissi_step1_i = ['ivo', 'ivi', 'iva', 'ive'];

    /** @var string[] Step 2: verb suffixes, deleted in RV. */
    private static $suffissi_step2 = [
        'ammo', 'ando', 'ano', 'are', 'arono', 'asse', 'assero', 'assi', 'assimo',
        'ata', 'ate', 'ati', 'ato', 'ava', 'avamo', 'avano', 'avate', 'avi', 'avo',
        'emmo', 'enda', 'ende', 'endi', 'endo', 'erà', 'erai', 'eranno', 'ere',
        'erebbe', 'erebbero', 'erei', 'eremmo', 'eremo', 'ereste', 'eresti', 'erete', 'erò', 'erono',
        'essero', 'ete', 'eva', 'evamo', 'evano', 'evate', 'evi', 'evo',
        'Yamo', 'iamo', 'immo', 'irà', 'irai', 'iranno', 'ire', 'irebbe',
        'irebbero', 'irei', 'iremmo', 'iremo', 'ireste', 'iresti', 'irete', 'irò', 'irono',
        'isca', 'iscano', 'isce', 'isci', 'isco', 'iscono', 'issero',
        'ita', 'ite', 'iti', 'ito', 'iva', 'ivamo', 'ivano', 'ivate', 'ivi', 'ivo',
        'ono', 'uta', 'ute', 'uti', 'uto', 'ar', 'ir',
    ];

    /** @var string[] Step 0: endings before the pronoun that cause the pronoun to be deleted. */
    private static $ante_suff_a = ['ando', 'endo'];

    /** @var string[] Step 0: endings before the pronoun that cause the pronoun to be replaced by 'e'. */
    private static $ante_suff_b = ['ar', 'er', 'ir'];

    /**
     * Instantiating the stemmer prepares the suffix tables; the same preparation
     * also happens lazily on the first static call to stem().
     */
    public function __construct()
    {
        self::sortSuffixes();
    }

    /**
     * Orders the multi-length suffix tables longest-first, once per process.
     *
     * The matching loops stop at the first suffix that fits, so the longest
     * candidate has to come first ('gliela' must be tried before 'la').
     */
    private static function sortSuffixes()
    {
        if (self::$suffixesSorted) {
            return;
        }

        $longestFirst = static function ($a, $b) {
            return mb_strlen($b) - mb_strlen($a);
        };

        usort(self::$suffissi_step0, $longestFirst);
        usort(self::$suffissi_step1_a, $longestFirst);
        usort(self::$suffissi_step2, $longestFirst);

        self::$suffixesSorted = true;
    }

    /**
     * Gets the stem of $word.
     *
     * @param string $word
     *
     * @return string
     *
     * @throws \InvalidArgumentException when PCRE reports an error for $word (e.g. invalid UTF-8).
     */
    public static function stem($word)
    {
        $word = mb_strtolower($word);

        // Running any /u pattern makes PCRE validate the subject as UTF-8.
        preg_match('#.#u', $word);
        if (preg_last_error() !== PREG_NO_ERROR) {
            throw new \InvalidArgumentException(
                'Word "'.$word.'" seems to be errornous. '."\n"
                .'Error code from preg_last_error(): '.preg_last_error()
            );
        }

        if (!isset(self::$cache[$word])) {
            self::$cache[$word] = self::getStem($word);
        }

        return self::$cache[$word];
    }

    /**
     * Runs the full pipeline (normalisation, then steps 0 to 4) on one word.
     *
     * @param string $word
     *
     * @return string
     */
    private static function getStem($word)
    {
        // stem() is static, so the tables may not have been prepared by a constructor.
        self::sortSuffixes();

        // mb_strtolower() instead of strtolower(): the latter is byte-wise and,
        // before PHP 8, locale-dependent, which can corrupt UTF-8 sequences.
        $str = mb_strtolower(trim($word));
        $str = self::replaceAccAcuti($str);
        $str = self::putUAfterQToUpper($str);
        $str = self::IUBetweenVowToUpper($str);

        $step0 = self::step0($str);
        $step1 = self::step1($step0);
        $step2 = self::step2($step0, $step1);
        $step3a = self::step3a($step2);
        $step3b = self::step3b($step3a);

        return self::step4($step3b);
    }

    /**
     * Replaces acute-accented vowels with their grave-accented counterparts.
     *
     * @param string $str
     *
     * @return string
     */
    private static function replaceAccAcuti($str)
    {
        return str_replace(self::$accenti_acuti, self::$accenti_gravi, $str);
    }

    /**
     * Marks a 'u' that follows 'q' as a consonant by upper-casing it.
     *
     * @param string $str
     *
     * @return string
     */
    private static function putUAfterQToUpper($str)
    {
        return str_replace('qu', 'qU', $str);
    }

    /**
     * Marks an 'i' or 'u' standing between two vowels as a consonant by upper-casing it.
     *
     * Only the middle letter is changed: the surrounding vowels must stay lower
     * case, otherwise the later region computations no longer see them as vowels.
     *
     * @param string $str
     *
     * @return string
     */
    private static function IUBetweenVowToUpper($str)
    {
        $vowels = implode('', self::$vocali);
        // The 'u' modifier makes the accented vowels match as characters, not as single bytes.
        $pattern = '/(['.$vowels.'])([iu])(['.$vowels.'])/u';

        return preg_replace_callback(
            $pattern,
            static function ($matches) {
                // Explicit mapping rather than strtoupper(), which is locale-dependent before PHP 8.
                return $matches[1].($matches[2] === 'i' ? 'I' : 'U').$matches[3];
            },
            $str
        );
    }

    /**
     * Returns the RV region of $str.
     *
     * If the second letter is a consonant, RV is the region after the next
     * following vowel; if the first two letters are vowels, RV is the region
     * after the next consonant; in the consonant-vowel case RV is the region
     * after the third letter. RV is empty if these positions cannot be found.
     *
     * Examples: macho -> [ho], oliva -> [va], trabajo -> [bajo], áureo -> [eo].
     *
     * @param string $str
     *
     * @return string
     */
    private static function returnRV($str)
    {
        if (mb_strlen($str) < 2) {
            return '';
        }

        // Character-wise access: byte indexing ($str[0]) breaks on accented first letters.
        $first = mb_substr($str, 0, 1);
        $second = mb_substr($str, 1, 1);

        if (in_array($second, self::$consonanti, true)) {
            return self::afterFirstOf(mb_substr($str, 2), self::$vocali);
        }

        $secondIsVowel = in_array($second, self::$vocali, true);

        if ($secondIsVowel && in_array($first, self::$vocali, true)) {
            return self::afterFirstOf($str, self::$consonanti);
        }

        if ($secondIsVowel && in_array($first, self::$consonanti, true)) {
            return mb_substr($str, 3);
        }

        // Neither letter class matched (digits, punctuation, ...): no RV region.
        return '';
    }

    /**
     * Returns everything after the first occurrence in $str of any of $letters,
     * or '' when none of them occurs.
     *
     * @param string   $str
     * @param string[] $letters single characters without regex meaning inside a character class
     *
     * @return string
     */
    private static function afterFirstOf($str, array $letters)
    {
        $pattern = '/['.implode('', $letters).'](.*)/us';

        return preg_match($pattern, $str, $matches) === 1 ? $matches[1] : '';
    }

    /**
     * Returns the R1 region of $str: the region after the first non-vowel
     * following a vowel, or '' if there is no such non-vowel.
     *
     * Examples: beautiful -> [iful], beauty -> [y], beau -> [], animadversion -> [imadversion].
     *
     * @param string $str
     *
     * @return string
     */
    private static function returnR1($str)
    {
        return self::regionAfterVowelConsonant($str);
    }

    /**
     * Returns the R2 region of $str: the R1 rule applied to R1 itself.
     *
     * Examples: beautiful -> [ul], beauty -> [], animadversion -> [adversion], eucharist -> [ist].
     *
     * @param string $str
     *
     * @return string
     */
    private static function returnR2($str)
    {
        return self::regionAfterVowelConsonant(self::returnR1($str));
    }

    /**
     * Returns what follows the first "one or more vowels, then a consonant"
     * sequence in $str, or '' when there is none.
     *
     * @param string $str
     *
     * @return string
     */
    private static function regionAfterVowelConsonant($str)
    {
        $pattern = '/['.implode('', self::$vocali).']+['.implode('', self::$consonanti).'](.*)/u';

        return preg_match($pattern, $str, $matches) === 1 ? $matches[1] : '';
    }

    /**
     * Tells whether $haystack ends with $needle.
     *
     * The comparison is byte-wise, which is exact for valid UTF-8 input.
     *
     * @param string $haystack
     * @param string $needle
     *
     * @return bool
     */
    private static function endsWith($haystack, $needle)
    {
        return $needle !== '' && substr($haystack, -strlen($needle)) === $needle;
    }

    /**
     * Step 0: attached pronoun. Always performed.
     *
     * Looks for the longest pronoun suffix at the end of RV. If what precedes it
     * in RV ends with 'ando'/'endo' the pronoun is deleted; if it ends with
     * 'ar'/'er'/'ir' the pronoun is replaced by 'e'. Otherwise nothing changes.
     *
     * @param string $str
     *
     * @return string
     */
    private static function step0($str)
    {
        $rv = self::returnRV($str);

        foreach (self::$suffissi_step0 as $suff) {
            if (!self::endsWith($rv, $suff)) {
                continue;
            }

            $suff_len = mb_strlen($suff);
            $ante_suff = mb_substr($rv, 0, mb_strlen($rv) - $suff_len);
            $without_suff = mb_substr($str, 0, mb_strlen($str) - $suff_len);

            foreach (self::$ante_suff_a as $ante_a) {
                if (self::endsWith($ante_suff, $ante_a)) {
                    return $without_suff;
                }
            }

            foreach (self::$ante_suff_b as $ante_b) {
                if (self::endsWith($ante_suff, $ante_b)) {
                    return $without_suff.'e';
                }
            }

            // Only the first (longest) matching pronoun is ever considered.
            return $str;
        }

        return $str;
    }

    /**
     * Removes the first suffix of $arr_suff that is found in the requested region.
     *
     * With $ovunque = false, the first listed suffix that ends the region is removed.
     * With $ovunque = true, the first listed suffix that ends the whole word is
     * selected, and it is removed only if it also lies inside the region.
     *
     * @param string[] $arr_suff candidate suffixes, tried in the given order
     * @param string   $str      word to shorten
     * @param int      $str_len  length of $str in characters
     * @param string   $where    region to test: 'r1', 'r2' or 'rv'
     * @param bool     $ovunque  see above
     *
     * @return string|null the shortened word, or null when nothing was removed
     *
     * @throws \InvalidArgumentException when $where is not a known region name.
     */
    private static function deleteStuff($arr_suff, $str, $str_len, $where, $ovunque = false)
    {
        switch ($where) {
            case 'r2':
                $r = self::returnR2($str);
                break;
            case 'rv':
                $r = self::returnRV($str);
                break;
            case 'r1':
                $r = self::returnR1($str);
                break;
            default:
                throw new \InvalidArgumentException('Unknown region "'.$where.'"');
        }

        if ($ovunque) {
            foreach ($arr_suff as $suff) {
                if (!self::endsWith($str, $suff)) {
                    continue;
                }

                // The first suffix ending the word decides; later ones are not tried.
                $ret_str = mb_substr($str, 0, $str_len - mb_strlen($suff));

                return (self::endsWith($r, $suff) && $ret_str !== '') ? $ret_str : null;
            }

            return null;
        }

        foreach ($arr_suff as $suff) {
            if (self::endsWith($r, $suff)) {
                return mb_substr($str, 0, $str_len - mb_strlen($suff));
            }
        }

        return null;
    }

    /**
     * Like deleteStuff() in its default mode, but returns $str unchanged when
     * no suffix could be removed.
     *
     * @param string[] $arr_suff
     * @param string   $str
     * @param string   $where 'r1', 'r2' or 'rv'
     *
     * @return string
     */
    private static function deleteOrKeep(array $arr_suff, $str, $where)
    {
        $ret_str = self::deleteStuff($arr_suff, $str, mb_strlen($str), $where);

        return !empty($ret_str) ? $ret_str : $str;
    }

    /**
     * Step 1: standard suffix removal. Always performed.
     *
     * The suffix groups are tried in a fixed order and the first group that
     * removes something determines the result.
     *
     * @param string $str
     *
     * @return string
     */
    private static function step1($str)
    {
        $str_len = mb_strlen($str);

        // 'amente': delete if in R1; if preceded by 'iv', delete if in R2 (and if further
        // preceded by 'at', delete if in R2); otherwise, if preceded by 'os', 'ic' or 'abil',
        // delete if in R2.
        $ret_str = self::deleteStuff(self::$suffissi_step1_g, $str, $str_len, 'r1');
        if (!empty($ret_str)) {
            $without_iv = self::deleteStuff(['iv'], $ret_str, mb_strlen($ret_str), 'r2');
            if (!empty($without_iv)) {
                return self::deleteOrKeep(['at'], $without_iv, 'r2');
            }

            return self::deleteOrKeep(['os', 'ic', 'abil'], $ret_str, 'r2');
        }

        // Delete if in R2.
        $ret_str = self::deleteStuff(self::$suffissi_step1_a, $str, $str_len, 'r2', true);
        if (!empty($ret_str)) {
            return $ret_str;
        }

        // Delete if in R2; if preceded by 'ic', delete if in R2.
        $ret_str = self::deleteStuff(self::$suffissi_step1_b, $str, $str_len, 'r2');
        if (!empty($ret_str)) {
            return self::deleteOrKeep(['ic'], $ret_str, 'r2');
        }

        // Replace with 'log' if in R2.
        $ret_str = self::deleteStuff(self::$suffissi_step1_c, $str, $str_len, 'r2');
        if (!empty($ret_str)) {
            return $ret_str.'log';
        }

        // Replace with 'u' if in R2.
        $ret_str = self::deleteStuff(self::$suffissi_step1_d, $str, $str_len, 'r2');
        if (!empty($ret_str)) {
            return $ret_str.'u';
        }

        // Replace with 'ente' if in R2.
        $ret_str = self::deleteStuff(self::$suffissi_step1_e, $str, $str_len, 'r2');
        if (!empty($ret_str)) {
            return $ret_str.'ente';
        }

        // Delete if in RV.
        $ret_str = self::deleteStuff(self::$suffissi_step1_f, $str, $str_len, 'rv');
        if (!empty($ret_str)) {
            return $ret_str;
        }

        // Delete if in R2; if preceded by 'abil', 'ic' or 'iv', delete if in R2.
        $ret_str = self::deleteStuff(self::$suffissi_step1_h, $str, $str_len, 'r2');
        if (!empty($ret_str)) {
            return self::deleteOrKeep(['abil', 'ic', 'iv'], $ret_str, 'r2');
        }

        // Delete if in R2; if preceded by 'at', delete if in R2 (and if further preceded
        // by 'ic', delete if in R2).
        $ret_str = self::deleteStuff(self::$suffissi_step1_i, $str, $str_len, 'r2');
        if (!empty($ret_str)) {
            $without_at = self::deleteStuff(['at'], $ret_str, mb_strlen($ret_str), 'r2');
            if (!empty($without_at)) {
                return self::deleteOrKeep(['ic'], $without_at, 'r2');
            }

            return $ret_str;
        }

        return $str;
    }

    /**
     * Step 2: verb suffixes. Performed only if step 1 removed nothing.
     *
     * @param string $str       the word as it entered step 1
     * @param string $str_step1 the word as it left step 1
     *
     * @return string
     */
    private static function step2($str, $str_step1)
    {
        // Strict comparison: a loose one would compare numeric-looking strings by value.
        if ($str !== $str_step1) {
            return $str_step1;
        }

        return self::deleteOrKeep(self::$suffissi_step2, $str, 'rv');
    }

    /**
     * Step 3a: delete a final 'a', 'e', 'i', 'o', 'à', 'è', 'ì' or 'ò' if it is in RV,
     * and a preceding 'i' if it is in RV ('crocchi' -> 'crocch', 'crocchio' -> 'crocch').
     * Always performed.
     *
     * @param string $str
     *
     * @return string
     */
    private static function step3a($str)
    {
        $vocale_finale = ['a', 'e', 'i', 'o', 'à', 'è', 'ì', 'ò'];

        $ret_str = self::deleteStuff($vocale_finale, $str, mb_strlen($str), 'rv');
        if (empty($ret_str)) {
            return $str;
        }

        return self::deleteOrKeep(['i'], $ret_str, 'rv');
    }

    /**
     * Step 3b: replace a final 'ch' (or 'gh') with 'c' (or 'g') if in RV
     * ('crocch' -> 'crocc'). Always performed.
     *
     * @param string $str
     *
     * @return string
     */
    private static function step3b($str)
    {
        $rv = self::returnRV($str);
        $before_rv = mb_substr($str, 0, mb_strlen($str) - mb_strlen($rv));

        // Keep only the captured 'c'/'g'; the trailing 'h' is dropped.
        return $before_rv.preg_replace('/([cg])h$/', '$1', $rv);
    }

    /**
     * Step 4: finally, turn the 'I' and 'U' markers back into lower case.
     *
     * @param string $str
     *
     * @return string
     */
    private static function step4($str)
    {
        return mb_strtolower($str);
    }
}