" Porter stemmer in VimL.
"
" Taken from:
" http://burakkanber.com/blog/machine-learning-full-text-search-in-javascript-relevance-scoring/
" Which referenced:
" Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
" no. 3, pp 130-137,
"
" see also http://www.tartarus.org/~martin/PorterStemmer
let s:step2list = {
\ "ational" : "ate"
\, "tional" : "tion"
\, "enci" : "ence"
\, "anci" : "ance"
\, "izer" : "ize"
\, "bli" : "ble"
\, "alli" : "al"
\, "entli" : "ent"
\, "eli" : "e"
\, "ousli" : "ous"
\, "ization" : "ize"
\, "ation" : "ate"
\, "ator" : "ate"
\, "alism" : "al"
\, "iveness" : "ive"
\, "fulness" : "ful"
\, "ousness" : "ous"
\, "aliti" : "al"
\, "iviti" : "ive"
\, "biliti" : "ble"
\, "logi" : "log"
\}
let s:step3list = {
\ "icate" : "ic"
\, "ative" : ""
\, "alize" : "al"
\, "iciti" : "ic"
\, "ical" : "ic"
\, "ful" : ""
\, "ness" : ""
\}
let s:c = "[^aeiou]" " consonant
let s:v = "[aeiouy]" " vowel
let s:C = s:c . "[^aeiouy]*" " consonant sequence
let s:V = s:v . "[aeiou]*" " vowel sequence
let s:mgr0 = '^\(' . s:C . '\)\?' . s:V . s:C " [C]VC... is m>0
let s:meq1 = '^\(' . s:C . '\)\?' . s:V . s:C . '\(' . s:V . '\)\?$' " [C]VC[V] is m=1
let s:mgr1 = '^\(' . s:C . '\)\?' . s:V . s:C . s:V . s:C " [C]VCVC... is m>1
let s:s_v = '^\(' . s:C . '\)\?' . s:v " vowel in stem
function! s:p(s)
return
echom string(a:s)
endfunction
function! ml#porter#stemmer(w)
let w = a:w
if len(w) < 3
return w
endif
let firstch = w[0]
if firstch == 'y'
let w = 'Y' . w[1:]
endif
" Step 1a
let re = '^\(.\{-}\)\(ss\|i\)es$'
let re2 = '^\(.\{-}\)\([^s]\)s$'
if w =~ re
let w = substitute(w, re, '\1\2', '')
elseif w =~ re2
let w = substitute(w, re2, '\1\2', '')
endif
call s:p(w)
" Step 1b
let re = '^\(.\{-}\)eed$'
let re2 = '^\(.\{-}\)\(ed\|ing\)$'
if w =~ re
let fp = matchlist(w, re)
let re = s:mgr0
if fp[1] =~ re
let re = '.$'
let w = substitute(w, re, '', '')
endif
elseif w =~ re2
let fp = matchlist(w, re2)
let stem = fp[1]
let re2 = s:s_v
if stem =~ re2
let w = stem
let re2 = '\(at\|bl\|iz\)$'
let re3 = '\([^aeiouylsz]\)\1$'
let re4 = '^' . s:C . s:v . '[^aeiouwxy]$'
if w =~ re2
let w = w . 'e'
elseif w =~ re3
let re = '.$'
let w = substitute(w, re, '', '')
elseif w =~ re4
let w = w . 'e'
endif
endif
endif
" Step 1c
let re = '^\(.\{-}\)y$'
if w =~ re
let fp = matchlist(w, re)
let stem = fp[1]
let re = s:s_v
if stem =~ re
let w = stem . 'i'
endif
endif
" Step 2
let re = '^\(.\{-}\)\(ational\|tional\|enci\|anci\|izer\|bli\|alli\|entli\|eli\|ousli\|ization\|ation\|ator\|alism\|iveness\|fulness\|ousness\|aliti\|iviti\|biliti\|logi\)$'
if w =~ re
let fp = matchlist(w, re)
let stem = fp[1]
let suffix = fp[2]
let re = s:mgr0
if stem =~ re
let w = stem . s:step2list[suffix]
endif
endif
" Step 3
let re = '^\(.\{-}\)\(icate\|ative\|alize\|iciti\|ical\|ful\|ness\)$'
if w =~ re
let fp = matchlist(w, re)
let stem = fp[1]
let suffix = fp[2]
let re = s:mgr0
if stem =~ re
let w = stem . s:step3list[suffix]
endif
endif
" Step 4
let re = '^\(.\{-}\)\(al\|ance\|ence\|er\|ic\|able\|ible\|ant\|ement\|ment\|ent\|ou\|ism\|ate\|iti\|ous\|ive\|ize\)$'
let re2 = '^\(.\{-}\)\(s\|t\)\(ion\)$'
if w =~ re
let fp = matchlist(w, re)
let stem = fp[1]
let re = s:mgr1
if stem =~ re
let w = stem
endif
elseif w =~ re2
let fp = matchlist(w, re2)
let stem = fp[1] . fp[2]
let re2 = s:mgr1
if stem =~ re2
let w = stem
endif
endif
" Step 5
let re = '^\(.\{-}\)e$'
if w =~ re
let fp = matchlist(w, re)
let stem = fp[1]
let re = s:mgr1
let re2 = s:meq1
let re3 = '^' . s:C . s:v . '[^aeiouwxy]$'
if (stem =~ re || stem =~ re2) && stem !~ re3
let w = stem
endif
endif
let re = 'll$'
let re2 = s:mgr1
if w =~ re && w =~ re2
let re = '.$'
let w = substitute(w, re, '', '')
endif
" and turn initial Y back to y
if firstch == 'y'
let w = 'y' . w[1:]
endif
return w
endfunction