" Porter stemmer in VimL.
"
" Taken from:
" http://burakkanber.com/blog/machine-learning-full-text-search-in-javascript-relevance-scoring/
" Which referenced:
"  Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
"  no. 3, pp 130-137,
"
" see also http://www.tartarus.org/~martin/PorterStemmer

let s:step2list = {
      \  "ational" : "ate"
      \, "tional"  : "tion"
      \, "enci"    : "ence"
      \, "anci"    : "ance"
      \, "izer"    : "ize"
      \, "bli"     : "ble"
      \, "alli"    : "al"
      \, "entli"   : "ent"
      \, "eli"     : "e"
      \, "ousli"   : "ous"
      \, "ization" : "ize"
      \, "ation"   : "ate"
      \, "ator"    : "ate"
      \, "alism"   : "al"
      \, "iveness" : "ive"
      \, "fulness" : "ful"
      \, "ousness" : "ous"
      \, "aliti"   : "al"
      \, "iviti"   : "ive"
      \, "biliti"  : "ble"
      \, "logi"    : "log"
      \}

let s:step3list = {
      \  "icate" : "ic"
      \, "ative" : ""
      \, "alize" : "al"
      \, "iciti" : "ic"
      \, "ical"  : "ic"
      \, "ful"   : ""
      \, "ness"  : ""
      \}

let s:c = "[^aeiou]"          " consonant
let s:v = "[aeiouy]"          " vowel
let s:C = s:c . "[^aeiouy]*"    " consonant sequence
let s:V = s:v . "[aeiou]*"      " vowel sequence

let s:mgr0 = '^\(' . s:C . '\)\?' . s:V . s:C                        " [C]VC... is m>0
let s:meq1 = '^\(' . s:C . '\)\?' . s:V . s:C . '\(' . s:V . '\)\?$' " [C]VC[V] is m=1
let s:mgr1 = '^\(' . s:C . '\)\?' . s:V . s:C . s:V . s:C            " [C]VCVC... is m>1
let s:s_v  = '^\(' . s:C . '\)\?' . s:v                              " vowel in stem

function! s:p(s)
  return
  echom string(a:s)
endfunction

function! ml#porter#stemmer(w)
  let w = a:w

  if len(w) < 3
    return w
  endif

  let firstch = w[0]
  if firstch == 'y'
    let w = 'Y' . w[1:]
  endif

  " Step 1a
  let re  = '^\(.\{-}\)\(ss\|i\)es$'
  let re2 = '^\(.\{-}\)\([^s]\)s$'

  if w =~ re
    let w = substitute(w, re, '\1\2', '')
  elseif w =~ re2
    let w = substitute(w, re2, '\1\2', '')
  endif

  call s:p(w)

  " Step 1b
  let re  = '^\(.\{-}\)eed$'
  let re2 = '^\(.\{-}\)\(ed\|ing\)$'

  if w =~ re
    let fp = matchlist(w, re)
    let re = s:mgr0
    if fp[1] =~ re
      let re = '.$'
      let w = substitute(w, re, '', '')
    endif
  elseif w =~ re2
    let fp = matchlist(w, re2)
    let stem = fp[1]
    let re2 = s:s_v
    if stem =~ re2
      let w = stem
      let re2 = '\(at\|bl\|iz\)$'
      let re3 = '\([^aeiouylsz]\)\1$'
      let re4 = '^' . s:C . s:v . '[^aeiouwxy]$'
      if w =~ re2
        let w = w . 'e'
      elseif w =~ re3
        let re = '.$'
        let w = substitute(w, re, '', '')
      elseif w =~ re4
        let w = w . 'e'
      endif
    endif
  endif

  " Step 1c
  let re = '^\(.\{-}\)y$'
  if w =~ re
    let fp = matchlist(w, re)
    let stem = fp[1]
    let re = s:s_v
    if stem =~ re
      let w = stem . 'i'
    endif
  endif

  " Step 2
  let re = '^\(.\{-}\)\(ational\|tional\|enci\|anci\|izer\|bli\|alli\|entli\|eli\|ousli\|ization\|ation\|ator\|alism\|iveness\|fulness\|ousness\|aliti\|iviti\|biliti\|logi\)$'
  if w =~ re
    let fp = matchlist(w, re)
    let stem = fp[1]
    let suffix = fp[2]
    let re = s:mgr0
    if stem =~ re
      let w = stem . s:step2list[suffix]
    endif
  endif

  " Step 3
  let re = '^\(.\{-}\)\(icate\|ative\|alize\|iciti\|ical\|ful\|ness\)$'
  if w =~ re
    let fp = matchlist(w, re)
    let stem = fp[1]
    let suffix = fp[2]
    let re = s:mgr0
    if stem =~ re
      let w = stem . s:step3list[suffix]
    endif
  endif

  " Step 4
  let re  = '^\(.\{-}\)\(al\|ance\|ence\|er\|ic\|able\|ible\|ant\|ement\|ment\|ent\|ou\|ism\|ate\|iti\|ous\|ive\|ize\)$'
  let re2 = '^\(.\{-}\)\(s\|t\)\(ion\)$'
  if w =~ re
    let fp = matchlist(w, re)
    let stem = fp[1]
    let re = s:mgr1
    if stem =~ re
      let w = stem
    endif
  elseif w =~ re2
    let fp = matchlist(w, re2)
    let stem = fp[1] . fp[2]
    let re2 = s:mgr1
    if stem =~ re2
      let w = stem
    endif
  endif

  " Step 5
  let re = '^\(.\{-}\)e$'
  if w =~ re
    let fp = matchlist(w, re)
    let stem = fp[1]
    let re  = s:mgr1
    let re2 = s:meq1
    let re3 = '^' . s:C . s:v . '[^aeiouwxy]$'
    if (stem =~ re || stem =~ re2) && stem !~ re3
      let w = stem
    endif
  endif

  let re  = 'll$'
  let re2 = s:mgr1
  if w =~ re && w =~ re2
    let re = '.$'
    let w = substitute(w, re, '', '')
  endif

  " and turn initial Y back to y

  if firstch == 'y'
    let w = 'y' . w[1:]
  endif

  return w
endfunction