588 lines
17 KiB
Python
588 lines
17 KiB
Python
#-*- coding: utf-8 -*-
|
|
# Generated from esperanto.sbl by Snowball 3.0.1 - https://snowballstem.org/
|
|
|
|
from .basestemmer import BaseStemmer
|
|
from .among import Among
|
|
|
|
|
|
class EsperantoStemmer(BaseStemmer):
    '''
    Esperanto stemmer implementing the algorithm of the Snowball script
    esperanto.sbl.

    Derived from the code generated from esperanto.sbl by Snowball 3.0.1 -
    https://snowballstem.org/
    '''

    # Character classes handed to the grouping tests.
    g_vowel = {u"a", u"e", u"i", u"o", u"u"}
    g_aou = {u"a", u"o", u"u"}
    g_digit = {u"0", u"1", u"2", u"3", u"4", u"5", u"6", u"7", u"8", u"9"}

    # Set while __r_canonical_form scans a segment containing q, w, x, y or
    # an acute-accented vowel; cleared again at every "-".
    B_foreign = False

    # a_0 result code -> (replacement text, does the match flag the word as
    # foreign?).  Codes 12-14 carry no replacement and are handled inline.
    __canonical_rewrites = {
        1: (u"ĉ", False),
        2: (u"ĝ", False),
        3: (u"ĥ", False),
        4: (u"ĵ", False),
        5: (u"ŝ", False),
        6: (u"ŭ", False),
        7: (u"a", True),
        8: (u"e", True),
        9: (u"i", True),
        10: (u"o", True),
        11: (u"u", True),
    }

    def __optional_b(self, s):
        '''
        Backward-match the literal `s` if it is there; if it is not, put the
        cursor back where it was.  Never fails.
        '''
        mark = self.limit - self.cursor
        if not self.eq_s_b(s):
            self.cursor = self.limit - mark

    def __at_start_or_hyphen(self):
        '''
        Return True when the cursor sits at limit_backward, or else when a
        "-" can be matched backward from the cursor; return False otherwise.
        '''
        if self.cursor <= self.limit_backward:
            return True
        if self.eq_s_b(u"-"):
            return True
        return False

    def __r_canonical_form(self):
        '''
        Scan forward, matching table a_0 at each position: rewrite the
        digraphs cx/gx/hx/jx/sx/ux to circumflex/breve letters, strip acute
        accents from vowels, and track B_foreign.

        Returns False if a slice operation fails or if B_foreign is set when
        the scan stops; True otherwise.
        '''
        self.B_foreign = False
        rewrites = EsperantoStemmer.__canonical_rewrites
        while True:
            scan_start = self.cursor
            self.bra = scan_start
            among_var = self.find_among(EsperantoStemmer.a_0)
            self.ket = self.cursor
            if among_var in rewrites:
                replacement, is_foreign = rewrites[among_var]
                if not self.slice_from(replacement):
                    return False
                if is_foreign:
                    self.B_foreign = True
            elif among_var == 12:
                self.B_foreign = True
            elif among_var == 13:
                self.B_foreign = False
            elif self.cursor < self.limit:
                # Nothing special here: step over one character.
                self.cursor += 1
            else:
                # No character left to step over: undo this iteration's
                # cursor movement and stop scanning.
                self.cursor = scan_start
                break
        return not self.B_foreign

    def __r_initial_apostrophe(self):
        '''
        If the whole input is "'st" followed by one entry of a_1, replace the
        leading apostrophe with "e".  Returns True only when that rewrite
        was made.
        '''
        self.bra = self.cursor
        if not self.eq_s(u"'"):
            return False
        self.ket = self.cursor
        if not self.eq_s(u"st"):
            return False
        if self.find_among(EsperantoStemmer.a_1) == 0:
            return False
        # The ending must reach the end of the input.
        if self.cursor < self.limit:
            return False
        if not self.slice_from(u"e"):
            return False
        return True

    def __r_pronoun(self):
        '''
        Backward: optional "n", then an entry of a_2 that begins the word or
        follows "-".  On a match the slice spanning the optional "n" is
        deleted and True is returned; otherwise False.
        '''
        self.ket = self.cursor
        self.__optional_b(u"n")
        self.bra = self.cursor
        if self.find_among_b(EsperantoStemmer.a_2) == 0:
            return False
        if not self.__at_start_or_hyphen():
            return False
        if not self.slice_del():
            return False
        return True

    def __r_final_apostrophe(self):
        '''
        Backward: replace a trailing apostrophe.  It becomes "a" after a bare
        "l", "u" after a bare "un", "aŭ" after an entry of a_3 that begins
        the word or follows "-", and "o" in every other case.

        Returns False when there is no trailing apostrophe or a slice
        operation fails; True otherwise.
        '''
        self.ket = self.cursor
        if not self.eq_s_b(u"'"):
            return False
        self.bra = self.cursor
        mark = self.limit - self.cursor

        replacement = None
        # "l'" with nothing before it.
        if self.eq_s_b(u"l") and self.cursor <= self.limit_backward:
            replacement = u"a"
        if replacement is None:
            # "un'" with nothing before it.
            self.cursor = self.limit - mark
            if self.eq_s_b(u"un") and self.cursor <= self.limit_backward:
                replacement = u"u"
        if replacement is None:
            self.cursor = self.limit - mark
            if (self.find_among_b(EsperantoStemmer.a_3) != 0
                    and self.__at_start_or_hyphen()):
                replacement = u"aŭ"
        if replacement is None:
            self.cursor = self.limit - mark
            replacement = u"o"

        if not self.slice_from(replacement):
            return False
        return True

    def __r_ujn_suffix(self):
        '''
        Backward: optional "n", optional "j", then an entry of a_4 that
        begins the word or follows "-".  On a match the slice spanning the
        optional letters is deleted and True is returned; otherwise False.
        '''
        self.ket = self.cursor
        self.__optional_b(u"n")
        self.__optional_b(u"j")
        self.bra = self.cursor
        if self.find_among_b(EsperantoStemmer.a_4) == 0:
            return False
        if not self.__at_start_or_hyphen():
            return False
        if not self.slice_del():
            return False
        return True

    def __r_uninflected(self):
        '''
        Backward: True when an entry of a_5 matches and either begins the
        word or follows "-".  Makes no slice changes.
        '''
        if self.find_among_b(EsperantoStemmer.a_5) == 0:
            return False
        if not self.__at_start_or_hyphen():
            return False
        return True

    def __r_merged_numeral(self):
        '''
        Backward: True when an entry of a_6 is immediately preceded by an
        entry of a_7.  Makes no slice changes.
        '''
        return (self.find_among_b(EsperantoStemmer.a_6) != 0
                and self.find_among_b(EsperantoStemmer.a_7) != 0)

    def __r_correlative(self):
        '''
        Backward: recognise an optional a_8 prefix + "i" + one of the endings
        "e", "en", or a/o/u with optional "j" and/or "n", where the match
        begins the word or follows "-".  On success the slice set while
        matching the ending (the optional "n"/"j" letters) is deleted and
        True is returned; otherwise False.
        '''
        self.ket = self.cursor
        self.bra = self.cursor
        word_end = self.limit - self.cursor

        # First alternative: optional "n" then "e".
        self.__optional_b(u"n")
        self.bra = self.cursor
        if not self.eq_s_b(u"e"):
            # Second alternative: optional "n", optional "j", then a/o/u.
            self.cursor = self.limit - word_end
            self.__optional_b(u"n")
            self.__optional_b(u"j")
            self.bra = self.cursor
            if not self.in_grouping_b(EsperantoStemmer.g_aou):
                return False

        if not self.eq_s_b(u"i"):
            return False

        before_prefix = self.limit - self.cursor
        if self.find_among_b(EsperantoStemmer.a_8) == 0:
            self.cursor = self.limit - before_prefix
        if not self.__at_start_or_hyphen():
            return False

        # Everything above was only a test: rewind before deleting.
        self.cursor = self.limit - word_end
        if not self.slice_del():
            return False
        return True

    def __r_long_word(self):
        '''
        Backward: succeed when, looking back from the cursor, there are two
        vowels, or a "-" with at least one character before it, or a digit.
        The three alternatives are tried in that order from the same start.
        '''
        mark = self.limit - self.cursor

        # Alternative 1: go past a vowel, twice.  The else clause runs only
        # if neither search failed.
        for _ in (0, 0):
            if not self.go_out_grouping_b(EsperantoStemmer.g_vowel):
                break
            self.cursor -= 1
        else:
            return True

        # Alternative 2: walk back to a "-" and then step over one more
        # character.  The else clause runs only if the hyphen was found.
        self.cursor = self.limit - mark
        while not self.eq_s_b(u"-"):
            if self.cursor <= self.limit_backward:
                break
            self.cursor -= 1
        else:
            if self.cursor > self.limit_backward:
                self.cursor -= 1
                return True

        # Alternative 3: go past a digit.
        self.cursor = self.limit - mark
        if not self.go_out_grouping_b(EsperantoStemmer.g_digit):
            return False
        self.cursor -= 1
        return True

    def __r_not_after_letter(self):
        '''
        Backward: True when the cursor is preceded by "-" or by a digit.
        Referenced as the extra condition on some entries of a_9.
        '''
        mark = self.limit - self.cursor
        if self.eq_s_b(u"-"):
            return True
        self.cursor = self.limit - mark
        if not self.in_grouping_b(EsperantoStemmer.g_digit):
            return False
        return True

    def __r_standard_suffix(self):
        '''
        Backward: delete an ending from a_9 together with a "-" directly
        before it, if there is one.  Returns False when no ending matches or
        the deletion fails.
        '''
        self.ket = self.cursor
        if self.find_among_b(EsperantoStemmer.a_9) == 0:
            return False
        self.__optional_b(u"-")
        self.bra = self.cursor
        if not self.slice_del():
            return False
        return True

    def _stem(self):
        '''
        Run the stemming pipeline on the current word.

        Returns False when canonical_form fails, when any of pronoun,
        correlative, uninflected, merged_numeral or ujn_suffix matches, or
        when long_word or standard_suffix fails; True otherwise.
        '''
        start = self.cursor
        if not self.__r_canonical_form():
            return False
        self.cursor = start
        self.__r_initial_apostrophe()
        self.cursor = start

        # Switch to backward processing from the end of the word.  Each step
        # starts at the limit, so "restore the cursor" is simply a reset to
        # the (possibly updated) limit.
        self.limit_backward = self.cursor
        self.cursor = self.limit

        if self.__r_pronoun():
            return False
        self.cursor = self.limit

        self.__r_final_apostrophe()
        self.cursor = self.limit

        for is_exempt in (self.__r_correlative,
                          self.__r_uninflected,
                          self.__r_merged_numeral,
                          self.__r_ujn_suffix):
            if is_exempt():
                return False
            self.cursor = self.limit

        if not self.__r_long_word():
            return False
        self.cursor = self.limit

        if not self.__r_standard_suffix():
            return False
        self.cursor = self.limit_backward
        return True

    # Forward table for __r_canonical_form; the third field is the action
    # code dispatched on there.
    a_0 = [
        Among(u"", -1, 14),
        Among(u"-", 0, 13),
        Among(u"cx", 0, 1),
        Among(u"gx", 0, 2),
        Among(u"hx", 0, 3),
        Among(u"jx", 0, 4),
        Among(u"q", 0, 12),
        Among(u"sx", 0, 5),
        Among(u"ux", 0, 6),
        Among(u"w", 0, 12),
        Among(u"x", 0, 12),
        Among(u"y", 0, 12),
        Among(u"á", 0, 7),
        Among(u"é", 0, 8),
        Among(u"í", 0, 9),
        Among(u"ó", 0, 10),
        Among(u"ú", 0, 11),
    ]

    # Endings accepted after "'st" by __r_initial_apostrophe.
    a_1 = [
        Among(u"as", -1, -1),
        Among(u"i", -1, -1),
        Among(u"is", 1, -1),
        Among(u"os", -1, -1),
        Among(u"u", -1, -1),
        Among(u"us", 4, -1),
    ]

    # Words matched by __r_pronoun.
    a_2 = [
        Among(u"ci", -1, -1),
        Among(u"gi", -1, -1),
        Among(u"hi", -1, -1),
        Among(u"li", -1, -1),
        Among(u"ili", 3, -1),
        Among(u"ŝli", 3, -1),
        Among(u"mi", -1, -1),
        Among(u"ni", -1, -1),
        Among(u"oni", 7, -1),
        Among(u"ri", -1, -1),
        Among(u"si", -1, -1),
        Among(u"vi", -1, -1),
        Among(u"ivi", 11, -1),
        Among(u"ĝi", -1, -1),
        Among(u"ŝi", -1, -1),
        Among(u"iŝi", 14, -1),
        Among(u"malŝi", 14, -1),
    ]

    # Stems after which __r_final_apostrophe turns an apostrophe into "aŭ".
    a_3 = [
        Among(u"amb", -1, -1),
        Among(u"bald", -1, -1),
        Among(u"malbald", 1, -1),
        Among(u"morg", -1, -1),
        Among(u"postmorg", 3, -1),
        Among(u"adi", -1, -1),
        Among(u"hodi", -1, -1),
        Among(u"ank", -1, -1),
        Among(u"ĉirk", -1, -1),
        Among(u"tutĉirk", 8, -1),
        Among(u"presk", -1, -1),
        Among(u"almen", -1, -1),
        Among(u"apen", -1, -1),
        Among(u"hier", -1, -1),
        Among(u"antaŭhier", 13, -1),
        Among(u"malgr", -1, -1),
        Among(u"ankor", -1, -1),
        Among(u"kontr", -1, -1),
        Among(u"anstat", -1, -1),
        Among(u"kvaz", -1, -1),
    ]

    # Words matched by __r_ujn_suffix.
    a_4 = [
        Among(u"aliu", -1, -1),
        Among(u"unu", -1, -1),
    ]

    # Words matched by __r_uninflected.
    a_5 = [
        Among(u"aha", -1, -1),
        Among(u"haha", 0, -1),
        Among(u"haleluja", -1, -1),
        Among(u"hola", -1, -1),
        Among(u"hosana", -1, -1),
        Among(u"maltra", -1, -1),
        Among(u"hura", -1, -1),
        Among(u"ĥaĥa", -1, -1),
        Among(u"ekde", -1, -1),
        Among(u"elde", -1, -1),
        Among(u"disde", -1, -1),
        Among(u"ehe", -1, -1),
        Among(u"maltre", -1, -1),
        Among(u"dirlididi", -1, -1),
        Among(u"malpli", -1, -1),
        Among(u"malĉi", -1, -1),
        Among(u"malkaj", -1, -1),
        Among(u"amen", -1, -1),
        Among(u"tamen", 17, -1),
        Among(u"oho", -1, -1),
        Among(u"maltro", -1, -1),
        Among(u"minus", -1, -1),
        Among(u"uhu", -1, -1),
        Among(u"muu", -1, -1),
    ]

    # Final part matched by __r_merged_numeral.
    a_6 = [
        Among(u"tri", -1, -1),
        Among(u"du", -1, -1),
        Among(u"unu", -1, -1),
    ]

    # Part that must precede an a_6 entry in __r_merged_numeral.
    a_7 = [
        Among(u"dek", -1, -1),
        Among(u"cent", -1, -1),
    ]

    # Optional prefixes before "i" in __r_correlative.
    a_8 = [
        Among(u"k", -1, -1),
        Among(u"kelk", 0, -1),
        Among(u"nen", -1, -1),
        Among(u"t", -1, -1),
        Among(u"mult", 3, -1),
        Among(u"samt", 3, -1),
        Among(u"ĉ", -1, -1),
    ]

    # Endings removed by __r_standard_suffix; entries with a fourth argument
    # are additionally conditioned on __r_not_after_letter.
    a_9 = [
        Among(u"a", -1, -1),
        Among(u"e", -1, -1),
        Among(u"i", -1, -1),
        Among(u"j", -1, -1, __r_not_after_letter),
        Among(u"aj", 3, -1),
        Among(u"oj", 3, -1),
        Among(u"n", -1, -1, __r_not_after_letter),
        Among(u"an", 6, -1),
        Among(u"en", 6, -1),
        Among(u"jn", 6, -1, __r_not_after_letter),
        Among(u"ajn", 9, -1),
        Among(u"ojn", 9, -1),
        Among(u"on", 6, -1),
        Among(u"o", -1, -1),
        Among(u"as", -1, -1),
        Among(u"is", -1, -1),
        Among(u"os", -1, -1),
        Among(u"us", -1, -1),
        Among(u"u", -1, -1),
    ]
|
|
|
|
|
|
class lab0(BaseException):
    '''Stateless jump marker; a BaseException, so `except Exception` does not catch it.'''
|
|
|
|
|
|
class lab1(BaseException):
    '''Stateless jump marker; a BaseException, so `except Exception` does not catch it.'''
|
|
|
|
|
|
class lab2(BaseException):
    '''Stateless jump marker; a BaseException, so `except Exception` does not catch it.'''
|
|
|
|
|
|
class lab3(BaseException):
    '''Stateless jump marker; a BaseException, so `except Exception` does not catch it.'''
|
|
|
|
|
|
class lab4(BaseException):
    '''Stateless jump marker; a BaseException, so `except Exception` does not catch it.'''
|
|
|
|
|
|
class lab5(BaseException):
    '''Stateless jump marker; a BaseException, so `except Exception` does not catch it.'''
|
|
|
|
|
|
class lab6(BaseException):
    '''Stateless jump marker; a BaseException, so `except Exception` does not catch it.'''
|
|
|
|
|
|
class lab7(BaseException):
    '''Stateless jump marker; a BaseException, so `except Exception` does not catch it.'''
|