Rewriting the University of Zagreb's Croatian Stemmer as an nltk-compliant class

The Faculty of Humanities and Social Sciences, University of Zagreb (FFZG) has published a Croatian stemmer as a Python file. However, it is intended to be used as a command-line script that stems all words within a file.

For my project, I adjusted it to be an nltk-compliant version consisting of a class which has a method stem(). Since the source code was published under LGPL, this is not a problem.

#-*-coding:utf-8-*-
#
#    Simple stemmer for Croatian v0.2
#    Copyright 2012 Nikola Ljubešić and Ivan Pandžić
#    Copyright 2015 Stefan Koch
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU Lesser General Public License as published
#    by the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#    GNU Lesser General Public License for more details.
#
#    You should have received a copy of the GNU Lesser General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.

import re
import sys
# Verb forms that CroatianStemmer.stem() returns unstemmed (only lower-cased).
# Each form is listed once; the groups below follow the verb they belong to.
stop = {
    # biti
    'biti', 'jesam', 'budem', 'sam', 'jesi', 'budeš', 'si', 'jesmo', 'budemo',
    'smo', 'jeste', 'budete', 'ste', 'jesu', 'budu', 'su', 'bih', 'bijah',
    'bjeh', 'bijaše', 'bi', 'bje', 'bješe', 'bijasmo', 'bismo', 'bjesmo',
    'bijaste', 'biste', 'bjeste', 'bijahu', 'biše', 'bjehu', 'bio', 'bili',
    'budimo', 'budite', 'bila', 'bilo', 'bile',
    # htjeti (short forms)
    'ću', 'ćeš', 'će', 'ćemo', 'ćete',
    # željeti
    'želim', 'želiš', 'želi', 'želimo', 'želite', 'žele',
    # morati
    'moram', 'moraš', 'mora', 'moramo', 'morate', 'moraju',
    # trebati
    'trebam', 'trebaš', 'treba', 'trebamo', 'trebate', 'trebaju',
    # moći
    'mogu', 'možeš', 'može', 'možemo', 'možete',
}

class CroatianStemmer(object):
    """Rule-based Croatian stemmer with an nltk-style ``stem()`` method.

    The stemmer is driven by two text files that are read once, when the
    instance is created:

    * a rules file whose lines hold two regular-expression fragments
      separated by a single space: the stem pattern and the suffix pattern;
    * a transformations file whose lines hold a word ending and its
      replacement, separated by a tab.

    ``stem()`` additionally consults the module-level ``stop`` set.
    """

    # Marks an 'r' that has no vowel on either side (string edges count as
    # "no vowel") so that it can be treated like a vowel further on.
    _SLOGOTVORNO_R = re.compile(r'(^|[^aeiou])r($|[^aeiou])')
    _SAMOGLASNIK = re.compile(r'[aeiouR]')

    def __init__(self, rules_path='croatian_rules.txt',
                 transformations_path='transformations.txt'):
        """Load the stemming rules and the ending transformations.

        Args:
            rules_path: Path of the rules file. The default is the file
                name the class has always used, resolved against the
                current working directory.
            transformations_path: Path of the transformations file, with
                the same default-resolution behaviour.

        Raises:
            OSError: If one of the files cannot be opened.
            ValueError: If a non-blank rules line does not consist of
                exactly two space-separated fields.
            re.error: If a rule does not form a valid regular expression.
        """
        # Read as UTF-8 explicitly: the data contains Croatian letters and
        # must not depend on the platform's default encoding. The ``with``
        # blocks make sure the files are closed again.
        self.pravila = []
        with open(rules_path, encoding='utf-8') as rules_file:
            for line in rules_file:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                osnova, nastavak = line.split(' ')
                self.pravila.append(
                    re.compile(r'^(' + osnova + ')(' + nastavak + r')$'))

        self.transformacije = []
        with open(transformations_path, encoding='utf-8') as transformations_file:
            for line in transformations_file:
                line = line.strip()
                if line:
                    self.transformacije.append(line.split('\t'))

    def stem(self, token):
        """Return the stem of ``token``.

        The token is lower-cased first. Tokens found in the module-level
        ``stop`` set are returned lower-cased but otherwise untouched; all
        others go through ``transformiraj`` and then ``korjenuj``.
        """
        lowered = token.lower()
        if lowered in stop:
            return lowered
        return self.korjenuj(self.transformiraj(lowered))

    def istakniSlogotvornoR(self, niz):
        """Return ``niz`` with every 'r' lacking a neighbouring vowel upper-cased.

        An 'r' is replaced by 'R' when the character before it and the
        character after it are each either missing (string edge) or not
        one of ``aeiou``.
        """
        return self._SLOGOTVORNO_R.sub(r'\1R\2', niz)

    def imaSamoglasnik(self, niz):
        """Return True if ``niz`` contains a vowel or an 'r' marked as 'R'."""
        return self._SAMOGLASNIK.search(self.istakniSlogotvornoR(niz)) is not None

    def transformiraj(self, pojavnica):
        """Apply the first transformation whose ending matches ``pojavnica``.

        Transformations are tried in file order; the matched ending is
        replaced by its counterpart. Without a match the word is returned
        unchanged.
        """
        for trazi, zamijeni in self.transformacije:
            if pojavnica.endswith(trazi):
                return pojavnica[:-len(trazi)] + zamijeni
        return pojavnica

    def korjenuj(self, pojavnica):
        """Strip the suffix from ``pojavnica`` using the first acceptable rule.

        Rules are tried in file order. A matching rule is accepted only if
        the remaining stem is longer than one character and contains a
        vowel (see ``imaSamoglasnik``); otherwise the next rule is tried.
        If no rule is accepted the word is returned unchanged.
        """
        for pravilo in self.pravila:
            dioba = pravilo.match(pojavnica)
            if dioba is None:
                continue
            osnova = dioba.group(1)
            if len(osnova) > 1 and self.imaSamoglasnik(osnova):
                return osnova
        return pojavnica

Usage is then as follows:

# Create the stemmer once (its constructor reads the rule files), then call
# stem() for each token.
croatian_stemmer = CroatianStemmer()
croatian_stemmer.stem('izgubio')

I do not maintain a comments section. If you have any questions or comments regarding my posts, please do not hesitate to send me an e-mail to blog@stefan-koch.name.