// Alias: ca
// From: https://snowballstem.org/algorithms/catalan/stemmer.html
// Author: Israel Olalla
// Licence: 3-clause BSD, as per https://snowballstem.org/license.html

routines (
    cleaning mark_regions
    R1 R2
    attached_pronoun
    standard_suffix
    verb_suffix
    residual_suffix
)

externals ( stem )

integers ( p1 p2 )

groupings ( v )

stringescapes {}

/* special characters */

stringdef a' '{U+00E1}'  // a-acute
stringdef a` '{U+00E0}'  // a-grave
stringdef c, '{U+00E7}'  // c-cedilla
stringdef e' '{U+00E9}'  // e-acute
stringdef e` '{U+00E8}'  // e-grave
stringdef i' '{U+00ED}'  // i-acute
stringdef i` '{U+00EC}'  // i-grave
stringdef i" '{U+00EF}'  // i-diaeresis
stringdef o' '{U+00F3}'  // o-acute
stringdef o` '{U+00F2}'  // o-grave
stringdef u' '{U+00FA}'  // u-acute
stringdef u" '{U+00FC}'  // u-diaeresis
stringdef - '{U+002D}'  // hyphen-minus (original note: for geminate forms)
stringdef . '{U+00B7}'  // middle dot (original note: for the geminate l)

// Vowel grouping used by mark_regions. Note that {i`} is defined above but
// is not a member of this grouping.
define v 'aeiou{a'}{a`}{e'}{e`}{i'}{i"}{o'}{o`}{u'}{u"}'

/*
    mark_regions sets the two region markers p1 and p2.

    Both default to limit, so R1 and R2 are empty unless the scan below
    succeeds. p1 is set just after the first non-vowel that follows a vowel;
    p2 is set by repeating the same scan starting from p1. The scan is wrapped
    in 'do', so a failure part-way through leaves the remaining marker(s) at
    their default.
*/
define mark_regions as (

    $p1 = limit
    $p2 = limit  // defaults

    do (
        gopast v  gopast non-v  setmark p1
        gopast v  gopast non-v  setmark p2
    )
)

/*
    cleaning walks forward through the word, replacing each accented vowel
    listed below with its unaccented letter and the middle dot with a full
    stop. {c,} has no entry, so c-cedilla is left unchanged.

    The empty string alternative matches everywhere, so any other character
    is simply skipped with 'next'; the repeat ends when 'next' fails at the
    end of the word.
*/
define cleaning as repeat (
    [substring] among(
        '{a'}' (<- 'a')
        '{a`}' (<- 'a')
        '{e'}' (<- 'e')
        '{e`}' (<- 'e')
        '{i'}' (<- 'i')
        '{i`}' (<- 'i')
        '{o'}' (<- 'o')
        '{o`}' (<- 'o')
        '{u'}' (<- 'u')
        '{u"}' (<- 'u')
        '{i"}' (<- 'i')
        '{.}'  (<- '.')
        ''     (next)  // no listed character here: step over one character
    )
)

backwardmode (

    // Region tests: succeed when the cursor is at or after the marker.
    define R1 as $p1 <= cursor
    define R2 as $p2 <= cursor

    /*
        attached_pronoun deletes the longest listed ending, provided it lies
        in R1. The list includes forms joined with an apostrophe ({'}) or a
        hyphen ({-}).
    */
    define attached_pronoun as (
        [substring] among (
            '{'}s' '{'}hi' '{'}ho' '{'}l' '{'}ls'
            '{-}ls' '{-}la' '{-}les' '{-}li'
            'vos' 'se' 'nos' '{-}nos' '{-}us' 'us'
            '{'}n' '{'}ns' '{-}n' '{-}ns'
            '{'}m' '{-}me' '{-}m'
            '{-}te' '{'}t'
            'li' 'lo' 'los'
            'me' 'sela' 'selo' 'selas' 'selos' 'le'
            'la' 'las' 'les' 'ens' 'ho' 'hi'
            (R1 delete)
        )
    )

    /*
        standard_suffix handles the longest matching ending from one among,
        split into five groups by action:
          - delete if in R1
          - delete if in R2
          - replace with 'log' if in R2
          - replace with 'ic' if in R2
          - replace with 'c' if in R1
        The routine fails when nothing matches or the region test fails, which
        lets 'stem' fall through to verb_suffix.
    */
    define standard_suffix as (
        [substring] among(
            'ar' 'atge' 'formes' 'icte' 'ictes'
            'ell' 'ells' 'ella' '{e'}s' '{e`}s' 'esc' 'essa' 'et' 'ets' 'eta'
            'eres' 'eries' 'ers' 'ina' 'ines' 'able' 'ls'
            'i{o'}' 'itat' 'itats' 'itzar' 'iva' 'ives' 'ivisme' 'ius'
            'fer' 'ment' 'amen' 'ament' 'aments' 'ments' 'ot' 'sfera' 'al' 'als' 'era' 'ana' 'iste'
            'aire' 'eria' 'esa' 'eses' 'esos' 'or' '{i'}cia' '{i'}cies' 'icis' 'ici' '{i'}ci' '{i'}cis'
            '{a`}ria' '{a`}ries' 'alla' 'ci{o'}' 'cions' 'n{c,}a' 'nces' '{o'}' 'dor' 'all'
            'il' '{i'}stic' 'enc' 'enca' '{i'}s' 'issa' 'issos' '{i'}ssem' '{i'}ssiu' 'issem' 'isseu' '{i'}sseu'
            '{o'}s' 'osa' 'dora' 'dores' 'dors' 'adura' 'ble' 'bles' '{i'}vol' '{i'}vola' 'd{i'}s' 'egar' 'ejar' 'ificar'
            'itar' 'ables' 'adors' 'idores' 'idors'
            'adora' 'aci{o'}' 'doras' 'dur' 'dures' 'alleng{u"}es'
            'ant' 'ants' 'ancia' 'ancies' 'at{o`}ria' 'at{o`}ries' 'tori' 'toris'
            'ats' 'ions' 'ota' 'isam' 'ors' 'ora' 'ores' 'isament'
            'bilitat' 'bilitats' 'ivitat' 'ivitats' 'ari' 'aris' 'ionisme' 'ionista' 'ionistes'
            'ialista' 'ialistes' 'ialisme' 'ialismes' 'ud' 'uts' 'uds' 'encia' 'encies' '{e`}ncia' '{e`}ncies'
            '{i"}tat' '{i"}tats' 'atiu' 'atius' 'atives' 'ativa' 'ativitat' 'ativitats' 'ible' 'ibles'
            'assa' 'asses' 'assos'
            'ent' 'ents'
            '{i'}ssim' '{i'}ssima' '{i'}ssims' '{i'}ssimes' '{i`}ssem' '{i`}sseu' '{i`}ssin'
            'ims' 'ima' 'imes'
            'isme' 'ista' 'ismes' 'istes'
            // NOTE(review): '{i'}inia' may be a typo for '{i'}nia' (compare
            // '{i'}nies' next to it) - confirm before changing; left as is.
            'inia' 'inies' '{i'}inia' '{i'}nies' 'ita' 'ites' 'triu' 'trius'
            'oses' 'osos' 'ient' 'otes' 'ots'
            (R1 delete)
            'acions' 'ada' 'ades'
            (R2 delete)
            'log{i'}a' 'log{i'}es' 'logia' 'logies' 'logi' 'logis' 'l{o'}gica' 'l{o'}gics' 'l{o'}giques'
            (R2 <- 'log')
            'ic' 'ica' 'ics' 'iques'
            (R2 <- 'ic')
            'qu{i'}ssim' 'qu{i'}ssims' 'qu{i'}ssimes' 'qu{i'}ssima'
            (R1 <- 'c')
        )
    )

    /*
        verb_suffix deletes the longest matching ending: every listed ending
        requires R1 except 'ando', which requires R2. It is only attempted
        by 'stem' when standard_suffix has failed.
    */
    define verb_suffix as (
        [substring] among(
            'ador' 'adora' 'adors' 'adores' 're' 'ie'
            'ent' 'ents' 'udes' 'ar{a`}' 'eren'
            'ar{a'}' 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais'
            'aria' 'arian' 'arien' 'aries' 'ar{a`}s'
            'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ara'
            'ar{e'}' 'ar{e'}s'
            'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais'
            'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}'
            'er{e'}' 'er' 'erau' 'erass'
            'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais'
            'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}'
            'ir{e'}' '{i'}rem' '{i'}reu' '{i'}eu'
            'ia' 'ies' '{i'}em' '{i`}eu' 'ien'
            'at' 'ut' 'uda' 'ava' 'aves' 'avem' '{a'}vem' '{a`}vem' '{a`}veu' '{a'}veu' 'aven' 'au' 'ats'
            'asseu' 'esseu' 'eresseu' '{a`}sseu' '{a`}ssem' '{a`}ssim' '{a`}ssiu'
            'essen' 'esses' 'assen' 'asses' 'assim' 'assiu'
            '{e'}ssen' '{e'}sseu' '{e'}ssim' '{e'}ssiu' '{e'}ssem'
            '{i'}' 'ares' '{a`}rem' '{a`}reu' '{a`}ren'
            'ar{i'}em' 'ar{i'}eu'
            'areu' 'aren' 'ant' '{i"}m' '{i"}u'
            '{e'}s' '{i"}en' 'en' 'es' 'em' 'am' 'ams' '{i"}a' '{i"}es'
            'dre' 'eix' 'eixer' 'tzar' 'eixes' 'ides' '{i"}des' 'it' '{i"}t' '{i"}da'
            'aba' 'ada' 'ades' 'ida' '{i'}a' 'iera' 'ad' 'ed' 'its'
            'id' 'ids' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an'
            'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado'
            'ido' 'iendo' 'i{o'}' 'ar' 'ir' 'as'
            'ieu' 'ii' 'io' 'i{a`}'
            'ess' 'essin' 'essis' 'ass' 'assin' 'assis' 'essim' '{e`}ssim' '{e`}ssiu'
            'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases'
            'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais'
            'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados'
            'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos' 'ques'
            '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos'
            'ira' 'iran' 'irem' 'iren' 'ires' 'ireu' 'iria' 'irien'
            'iries' 'ir{a`}' 'ir{a`}s' 'ir{e`}' 'ir{i`}em' 'ir{i`}eu'
            'isquen' 'iguem' 'igueu' 'esqui' 'esquin' 'esquis' 'eixi' 'eixin' 'eixis'
            'eixen' 'eixo' 'isin' 'isis' 'esques' 'sis' 'sin'
            'int' 'ir{i'}em' 'ir{i'}eu' 'isc' 'atges' 'esca' 'esquen'
            'issen' 'isses' 'issin' 'issis' 'isca' 'issiu' 'issim'
            // NOTE(review): '{i'}ssiu' and '{i'}ssim' use i-acute in a row
            // that is otherwise i-diaeresis - possibly intended as {i"};
            // confirm before changing; left as is.
            '{i"}sc' '{i"}sca' '{i"}ssin' '{i'}ssiu' '{i'}ssim' '{i"}ssis' '{i"}guem' '{i"}gueu'
            '{i"}ra' '{i"}ren' '{i"}res'
            '{i"}squen' '{i"}sques' '{i"}ssen' '{i"}sses' '{i"}xo' '{i"}xen' '{i"}xes' '{i"}x'
            'ixo' 'ixen' 'ixes' 'ix' 'ixa' 'inin' 'inis' 'ini' 'ineu' 'itza' 'itzi' 'itzeu' 'itzis'
            'itzo' 'itz' 'itz{a`}' 'arem' 'in' '{a`}s' 'i{i"}' 'i{i"}n' 'i{i"}s'
            (R1 delete)
            'ando'
            (R2 delete)
        )
    )

    /*
        residual_suffix removes a short final ending if it is in R1; a final
        'iqu' in R1 is rewritten to 'ic' instead of being deleted.
    */
    define residual_suffix as (
        [substring] among(
            'os' 'a' 'o' '{a'}' '{a`}' '{i'}' '{o'}' 'e' '{e'}' 'eu' 'iu'
            'is' 'i' 'ir' 's' '{i`}' 'itz' '{i"}' '{i"}n' '{i"}s' 'it'
            (R1 delete)
            'iqu'
            (R1 <- 'ic')
        )
    )
)

/*
    stem is the external entry point.

    Each step is wrapped in 'do', so a step that fails leaves the word as it
    was and the remaining steps still run. Order of work:
      1. mark_regions (forwards)
      2. backwards from the end of the word: attached_pronoun, then
         standard_suffix or (only if that fails) verb_suffix, then
         residual_suffix
      3. cleaning (forwards), i.e. accents are stripped only after the
         suffix steps have run, so the suffix lists above match accented
         forms.
*/
define stem as (
    do mark_regions
    backwards (
        do attached_pronoun
        do ( standard_suffix or
             verb_suffix
        )
        do residual_suffix
    )
    do cleaning
)

/*
    First working version: 2010/07/19
    First grammatical reviews: https://ca.wikipedia.org/wiki/Gram%C3%A0tica_catalana
    Suffix list: http://wapedia.mobi/ca/Llista_de_sufixos
    Irregular verbs: https://ca.wikipedia.org/wiki/Flexi%C3%B3_verbal_del_catal%C3%A0
*/