Rewriting the University of Zagreb's Croatian Stemmer as an nltk-compliant class
The Faculty of Humanities and Social Sciences, University of Zagreb (FFZG) has published a Croatian stemmer as a Python file. However, it is intended to be used as a command-line script for stemming all words within a file.
For my project, I adjusted it to be an nltk-compliant version consisting of a class which has a method stem(). Since the source code was published under the LGPL, this is not a problem.
#-*-coding:utf-8-*-
#
# Simple stemmer for Croatian v0.2
# Copyright 2012 Nikola Ljubešić and Ivan Pandžić
# Copyright 2015 Stefan Koch
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import io
import re
import sys
# Stop words: CroatianStemmer.stem() returns these lower-cased and otherwise
# untouched instead of running them through the stemming rules.
stop = {
    # forms of "biti" (to be)
    'biti', 'jesam', 'budem', 'sam', 'jesi', 'budeš', 'si', 'jesmo',
    'budemo', 'smo', 'jeste', 'budete', 'ste', 'jesu', 'budu', 'su',
    'bih', 'bijah', 'bjeh', 'bijaše', 'bi', 'bje', 'bješe', 'bijasmo',
    'bismo', 'bjesmo', 'bijaste', 'biste', 'bjeste', 'bijahu', 'biše',
    'bjehu', 'bio', 'bili', 'budimo', 'budite', 'bila', 'bilo', 'bile',
    # future-tense auxiliary
    'ću', 'ćeš', 'će', 'ćemo', 'ćete',
    # "željeti" (to want)
    'želim', 'želiš', 'želi', 'želimo', 'želite', 'žele',
    # "morati" (must)
    'moram', 'moraš', 'mora', 'moramo', 'morate', 'moraju',
    # "trebati" (to need)
    'trebam', 'trebaš', 'treba', 'trebamo', 'trebate', 'trebaju',
    # "moći" (can)
    'mogu', 'možeš', 'može', 'možemo', 'možete',
}
class CroatianStemmer(object):
    """Rule-based stemmer for Croatian with an nltk-style ``stem()`` method.

    Two plain-text data files drive the stemmer:

    * a rules file whose non-blank lines hold two regex fragments separated
      by a single space (stem pattern, then suffix pattern);
    * a transformations file whose non-blank lines hold a word ending and
      its replacement separated by a tab.

    Both files are read as UTF-8 (assumed to match the source encoding
    declared at the top of this module).
    """

    # An "r" with no vowel on either side is syllabic; it is marked as "R"
    # so that it can be counted as a vowel.
    _SYLLABIC_R = re.compile(r'(^|[^aeiou])r($|[^aeiou])')
    _VOWEL = re.compile(r'[aeiouR]')

    def __init__(self, rules_path='croatian_rules.txt',
                 transformations_path='transformations.txt'):
        """Load the stemming rules and the suffix transformations.

        :param rules_path: path of the rules file; defaults to
            ``croatian_rules.txt`` relative to the working directory.
        :param transformations_path: path of the transformations file;
            defaults to ``transformations.txt`` relative to the working
            directory.
        :raises IOError: if either file cannot be opened.
        :raises ValueError: if a rules line does not consist of exactly two
            space-separated fields.
        """
        # Context managers close the handles deterministically; blank lines
        # (e.g. a trailing newline at the end of the file) are skipped
        # rather than crashing on unpacking.
        with io.open(rules_path, encoding='utf-8') as rules_file:
            rule_fields = [line.strip().split(' ')
                           for line in rules_file if line.strip()]
        self.pravila = [
            re.compile(r'^(' + osnova + ')(' + nastavak + r')$')
            for osnova, nastavak in rule_fields
        ]
        with io.open(transformations_path, encoding='utf-8') as trans_file:
            self.transformacije = [line.strip().split('\t')
                                   for line in trans_file if line.strip()]

    def stem(self, token):
        """Return the stem of ``token``.

        The token is lower-cased first. Stop words are returned as is;
        every other word is passed through ``transformiraj`` and then
        ``korjenuj``.
        """
        word = token.lower()
        if word in stop:
            return word
        return self.korjenuj(self.transformiraj(word))

    def istakniSlogotvornoR(self, niz):
        """Return ``niz`` with each syllabic "r" replaced by "R"."""
        return self._SYLLABIC_R.sub(r'\1R\2', niz)

    def imaSamoglasnik(self, niz):
        """Return True if ``niz`` contains a vowel or a syllabic "r"."""
        return self._VOWEL.search(self.istakniSlogotvornoR(niz)) is not None

    def transformiraj(self, pojavnica):
        """Apply the first transformation whose ending matches ``pojavnica``.

        Returns the word with that ending replaced, or the word unchanged
        when no transformation applies.
        """
        for trazi, zamijeni in self.transformacije:
            if pojavnica.endswith(trazi):
                return pojavnica[:-len(trazi)] + zamijeni
        return pojavnica

    def korjenuj(self, pojavnica):
        """Strip the suffix of ``pojavnica`` using the first suitable rule.

        A rule is used only if the remaining stem is longer than one
        character and contains a vowel (or syllabic "r"); otherwise the
        next rule is tried. The word is returned unchanged if no rule fits.
        """
        for pravilo in self.pravila:
            dioba = pravilo.match(pojavnica)
            if dioba is None:
                continue
            osnova = dioba.group(1)
            if len(osnova) > 1 and self.imaSamoglasnik(osnova):
                return osnova
        return pojavnica
Usage is then as follows:
# Creating the stemmer reads 'croatian_rules.txt' and 'transformations.txt'
# via relative paths, so both files must be in the current working directory.
stemmer = CroatianStemmer()
# stem() lower-cases the word and returns its stem.
stemmer.stem('izgubio')