#-*-coding:utf-8-*-
#
# Simple stemmer for Croatian v0.2
# Copyright 2012 Nikola Ljubešić and Ivan Pandžić
# Copyright 2015 Stefan Koch
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import re
import sys
# Word forms that CroatianStemmer.stem() returns as-is (only lower-cased)
# instead of running them through the transformation and suffix rules.
# Entries must be bare lower-case words with no surrounding whitespace,
# because stem() looks tokens up with `token.lower() in stop`.
stop = {
    'biti', 'jesam', 'budem', 'sam', 'jesi', 'budeš', 'si', 'jesmo', 'budemo',
    'smo', 'jeste', 'budete', 'ste', 'jesu', 'budu', 'su', 'bih', 'bijah', 'bjeh',
    'bijaše', 'bi', 'bje', 'bješe', 'bijasmo', 'bismo', 'bjesmo', 'bijaste', 'biste',
    'bjeste', 'bijahu', 'biše', 'bjehu',
    'bio', 'bili', 'budimo', 'budite', 'bila', 'bilo', 'bile', 'ću', 'ćeš', 'će',
    'ćemo', 'ćete', 'želim', 'želiš', 'želi', 'želimo', 'želite', 'žele', 'moram',
    'moraš', 'mora', 'moramo', 'morate', 'moraju', 'trebam', 'trebaš', 'treba',
    'trebamo', 'trebate', 'trebaju', 'mogu', 'možeš', 'može', 'možemo', 'možete',
}
class CroatianStemmer(object):
    """Rule-based stemmer for Croatian word forms.

    Stemming is done in two steps: an optional suffix replacement taken
    from a transformations table, followed by suffix stripping driven by a
    list of (base, ending) regular-expression rules.

    Attributes:
        pravila: compiled rule patterns of the form ``^(base)(ending)$``,
            in the order they appear in the rules file.
        transformacije: ``[search, replacement]`` suffix pairs, in the
            order they appear in the transformations file.
    """

    # An "r" that has no vowel on either side (word edges count as
    # non-vowels); such an "r" is treated like a vowel by imaSamoglasnik().
    _SLOGOTVORNO_R = re.compile(r'(^|[^aeiou])r($|[^aeiou])')
    _SAMOGLASNIK = re.compile(r'[aeiouR]')

    def __init__(self, rules_path='croatian_rules.txt',
                 transformations_path='transformations.txt'):
        """Load the stemming rules and suffix transformations.

        Args:
            rules_path: UTF-8 text file with one rule per line, written as
                two regular-expression fragments separated by a single
                space: the base pattern and the ending pattern.
            transformations_path: UTF-8 text file with one transformation
                per line, written as ``search<TAB>replacement``.

        Both paths are handed to ``open()`` unchanged, so the relative
        defaults are looked up in the current working directory. Blank
        lines in either file are ignored.

        Raises:
            OSError: if a file cannot be opened.
            ValueError: if a rules line does not split into exactly two
                space-separated fields.
            re.error: if a rule is not a valid regular expression.
        """
        with open(rules_path, encoding='utf-8') as rules_file:
            rule_lines = [line.strip() for line in rules_file]
        self.pravila = [
            re.compile(r'^(' + osnova + ')(' + nastavak + r')$')
            for osnova, nastavak in (
                line.split(' ') for line in rule_lines if line
            )
        ]

        with open(transformations_path, encoding='utf-8') as transformations_file:
            transformation_lines = [line.strip() for line in transformations_file]
        self.transformacije = [
            line.split('\t') for line in transformation_lines if line
        ]

    def stem(self, token):
        """Return the stem of ``token``.

        The token is lower-cased first. Words found in the module-level
        ``stop`` set are returned lower-cased but otherwise untouched;
        everything else goes through transformiraj() and then korjenuj().
        """
        lowered = token.lower()
        if lowered in stop:
            return lowered
        return self.korjenuj(self.transformiraj(lowered))

    def istakniSlogotvornoR(self, niz):
        """Return ``niz`` with every vowel-less-context "r" upper-cased.

        An "r" is replaced by "R" when the character before it is the
        start of the string or a non-vowel, and the character after it is
        the end of the string or a non-vowel.
        """
        return self._SLOGOTVORNO_R.sub(r'\1R\2', niz)

    def imaSamoglasnik(self, niz):
        """Return True if ``niz`` contains a vowel or an "r" marked by
        istakniSlogotvornoR(), otherwise False."""
        marked = self.istakniSlogotvornoR(niz)
        return self._SAMOGLASNIK.search(marked) is not None

    def transformiraj(self, pojavnica):
        """Apply the first matching suffix transformation to ``pojavnica``.

        The transformations are tried in file order; the first whose
        search suffix ends the word replaces that suffix and the result is
        returned immediately. If none applies the word is returned as is.
        """
        for trazi, zamijeni in self.transformacije:
            if pojavnica.endswith(trazi):
                return pojavnica[:-len(trazi)] + zamijeni
        return pojavnica

    def korjenuj(self, pojavnica):
        """Strip an ending from ``pojavnica`` using the loaded rules.

        Rules are tried in order. A rule is accepted only if the base it
        captures is longer than one character and passes imaSamoglasnik();
        otherwise the next rule is tried. If no rule is accepted the word
        is returned unchanged.
        """
        for pravilo in self.pravila:
            dioba = pravilo.match(pojavnica)
            if dioba is None:
                continue
            osnova = dioba.group(1)
            if len(osnova) > 1 and self.imaSamoglasnik(osnova):
                return osnova
        return pojavnica