" Create a scanner object over a:str.
"
" a:str is either a String or a List; a List is joined with "\n" first.
" The returned Dict holds:
"   string   the text being scanned
"   length   len() of that text
"   index    current scan position (offset into string), starting at 0
"   matches  the matchlist() result of the last successful scan()
" plus the methods defined below.
function! string#scanner(str)
  let obj = {}
  if type(a:str) == type([])
    let obj.string = join(a:str, "\n")
  else
    let obj.string = a:str
  endif
  let obj.length = len(obj.string)
  let obj.index = 0

  " Return true once index has reached the end of the text.
  func obj.eos() dict
    return self.index >= self.length
  endfunc

  " Insert a:str into the text at the current index (index itself is not
  " moved) and return the scanner.
  func obj.inject(str) dict
    let self.string = strpart(self.string, 0, self.index)
          \ . a:str . strpart(self.string, self.index)
    let self.length = len(self.string)
    return self
  endfunc

  " If a:pat matches at the current index, move index past the match.
  " Returns the new index, or -1 (index unchanged) when there is no match.
  func obj.skip(pat) dict
    let pos = matchend(self.string, '\_^' . a:pat, self.index)
    if pos != -1
      let self.index = pos
    endif
    return pos
  endfunc

  " Move index forward to the start of the next match of a:pat.
  " Returns the new index, or -1 (index unchanged) when a:pat is not found.
  func obj.skip_until(pat) dict
    let pos = matchend(self.string, '\_.\{-}\ze' . a:pat, self.index)
    if pos != -1
      let self.index = pos
    endif
    return pos
  endfunc

  " Match a:pat at the current index.  On success index is advanced by the
  " length of the match, self.matches receives the matchlist() result and
  " the matched text is returned; otherwise "" is returned.
  func obj.scan(pat) dict
    " Use \_^ here to anchor the match at the start of the index.
    " Otherwise it finds the first match after index.
    " NOTE(review): a:pat is appended as-is, so a top-level \| in a:pat
    " leaves the later branches unanchored.
    let m = matchlist(self.string, '\_^' . a:pat, self.index)
    if ! empty(m)
      let self.index += len(m[0])
      let self.matches = m
      return m[0]
    endif
    return ""
  endfunc

  " Return a List of every match of a:pat from the current index onwards.
  func obj.collect(pat) dict
    let matches = []
    while ! self.eos()
      if self.skip_until(a:pat) == -1
        break
      endif
      let before = self.index
      let found = self.scan(a:pat)
      if self.index == before
        " scan() made no progress (empty match, or nothing anchored here).
        " Step over one position so the loop cannot spin on the same index
        " forever; split() guards the same situation.
        let self.index += 1
        continue
      endif
      call add(matches, found)
    endwhile
    return matches
  endfunc

  " Split the text on a:sep and return the pieces as a List.
  " With a true optional second argument the non-empty separators are
  " included in the result as well.
  func obj.split(sep, ...) dict
    let keepsep = 0
    if a:0
      let keepsep = a:1
    endif
    let pieces = []
    let old_index = 0
    while ! self.eos()
      if self.skip_until(a:sep) == -1
        " No further separator: the remainder is the last piece.
        call add(pieces, strpart(self.string, old_index))
        break
      endif
      let the_piece = strpart(self.string, old_index, (self.index - old_index))
      call add(pieces, the_piece)
      let the_sep = self.scan(a:sep)
      if keepsep && (the_sep != '')
        call add(pieces, the_sep)
      endif
      if old_index == self.index
        " Nothing was consumed this round: emit one position as a piece and
        " step over it so the loop always advances.
        call add(pieces, strpart(self.string, old_index, 1))
        let self.index += 1
      endif
      let old_index = self.index
    endwhile
    return pieces
  endfunc

  return obj
endfunction
" A list of tokens with navigation methods & element access
function! string#tokens()
  " Returns a Dict holding a List of [type, value, line] tokens together
  " with a read cursor:
  "   tokens    the token List, filled through add()
  "   index     position of next_tok within tokens
  "   cur_tok   the token most recently returned by next()
  "   next_tok  the token the following next() call will return
  " Call finalise() once all tokens are added and before using next().
  let obj = {}
  let obj.tokens = []
  let obj.index = 0
  let obj.cur_tok = []
  let obj.next_tok = []

  " Append the ['_end_', '_end_', line] sentinel, record the token count and
  " prime next_tok with the first token.  Returns the object itself.
  func obj.finalise() dict
    " The sentinel reuses the line of the last real token; an empty list has
    " no such token, so fall back to line 1 rather than failing on [-1].
    let last_line = empty(self.tokens) ? 1 : self.tokens[-1][-1]
    call add(self.tokens, ['_end_', '_end_', last_line])
    let self.num_tokens = len(self.tokens)
    let self.next_tok = self.tokens[0]
    return self
  endfunc

  " Advance the cursor by one token and return the new current token.
  " Once the '_end_' sentinel is reached it is returned on every later call.
  func obj.next() dict
    let self.cur_tok = self.next_tok
    " Stop at the last valid index (num_tokens - 1); stepping past it would
    " make the lookup below fail with a list index error.
    if self.index < self.num_tokens - 1
      let self.index += 1
    endif
    let self.next_tok = self.tokens[self.index]
    return self.cur_tok
  endfunc

  " Append a [type, value, line] token.
  func obj.add(type, value, line) dict
    call add(self.tokens, [a:type, a:value, a:line])
  endfunc

  return obj
endfunction
" Lex a:string (a String, or a List that is joined with "\n") into tokens.
"
" Returns the lexer Dict; after construction its .tokens entry is the
" finalised token object from string#tokens(), holding [type, value, line]
" entries.  The token types are the keys of obj.patterns; at each position
" the types are tried in obj.pattern_order and the first pattern that
" matches wins.
function! string#lexer(string)
  let obj = {}
  let obj.tokens = string#tokens()
  let obj.string = ''
  let obj.line_continuation_pattern = '\n\s*\\'
  let obj.pattern_order = [
        \ 'whitespace', 'name'
        \, 'float_number', 'hex_number', 'oct_number', 'int_number'
        \, 'tq_string', 'dq_string', 'sq_string'
        \, 'operator', 'comment', 'unknown'
        \]
  let obj.newline_patterns = [
        \ 'whitespace'
        \, 'tq_string', 'dq_string', 'sq_string'
        \, 'comment', 'unknown'
        \]
  let obj.patterns = {
        \ 'whitespace' : ['\s\+', '\n\%(\s*\\\s*\)\?']
        \, 'name' : ['[ablgstw]:\w*', '[_a-zA-Z]\+']
        \, 'float_number' : ['\d\+\.\d\+\%([eE][+-]\?\d\+\)\?']
        \, 'hex_number' : ['0x\x\+']
        \, 'oct_number' : ['0\o\+']
        \, 'int_number' : ['\d\+']
        \, 'tq_string' : ['"""\_.\{-}"""']
        \, 'dq_string' : ['"\%(\\\.\|[^\n]\)*"']
        \, 'sq_string' : ['''\%(''''\|\_.\)\{-}''']
        \, 'operator' : ['[\\\[\](){}<>:,./\\?=+!@#$%^&*`~|-]\+']
        \, 'comment' : ['"[^\n]*\n']
        \, 'unknown' : ['\S\+']
        \}

  " (Re)initialise the lexer with a:str, lex it and finalise the token list.
  " A "\n" is appended to the text handed to the scanner.
  func obj.new(str) dict
    let self.tokens = string#tokens()
    let self.string = type(a:str) == type([]) ? join(a:str, "\n") : a:str
    let self.ss = string#scanner(self.string . "\n")
    call self.lex()
    let self.tokens = self.tokens.finalise()
    return self
  endfunc

  " Remove every line continuation (newline, optional blanks, backslash)
  " from a:string.
  func obj.join_line_continuations(string) dict
    return substitute(a:string, self.line_continuation_pattern, '', 'g')
  endfunc

  " Walk the scanner to the end of the text, adding one token per match.
  func obj.lex() dict
    let line_nr = 1
    while self.ss.index < self.ss.length
      let found = 0
      for kind in self.pattern_order
        " Types listed in newline_patterns get their continuations joined
        " and have the newlines of the raw match added to the line counter.
        let spans_lines = index(self.newline_patterns, kind) != -1
        for regex in self.patterns[kind]
          let raw = self.ss.scan(regex)
          if raw == ''
            continue
          endif
          let found = 1
          let value = spans_lines ? self.join_line_continuations(raw) : raw
          " The token carries the line on which it starts.
          call self.tokens.add(kind, value, line_nr)
          if spans_lines
            let line_nr += len(substitute(raw, '[^\n]', '', 'g'))
          endif
          break
        endfor
        if found
          break
        endif
      endfor
    endwhile
  endfunc

  return obj.new(a:string)
endfunction
" Stop words that string#tokenize() filters out of its result.  Every entry
" is passed through ml#porter#stemmer() here, because the tokenizer compares
" this list against words that have already been stemmed.
" NOTE(review): string#tokenize() replaces every \W character (including the
" apostrophe) with a space before stemming, so the contracted forms below
" ("aren't", "he'd", ...) presumably never match as a whole - confirm.
let s:stops = map(
\ ["a" , "about" , "above" , "after" , "again" , "against" , "all" , "am" , "an" , "and" , "any" , "are" , "aren't" , "as" , "at" , "be" , "because" , "been" , "before" , "being" , "below" , "between" , "both" , "but" , "by" , "can't" , "cannot" , "could" , "couldn't" , "did" , "didn't" , "do" , "does" , "doesn't" , "doing" , "don't" , "down" , "during" , "each" , "few" , "for" , "from" , "further" , "had" , "hadn't" , "has" , "hasn't" , "have" , "haven't" , "having" , "he" , "he'd" , "he'll" , "he's" , "her" , "here" , "here's" , "hers" , "herself" , "him" , "himself" , "his" , "how" , "how's" , "i" , "i'd" , "i'll" , "i'm" , "i've" , "if" , "in" , "into" , "is" , "isn't" , "it" , "it's" , "its" , "itself" , "let's" , "me" , "more" , "most" , "mustn't" , "my" , "myself" , "no" , "nor" , "not" , "of" , "off" , "on" , "once" , "only" , "or" , "other" , "ought" , "our" , "ours" , "ourselves" , "out" , "over" , "own" , "same" , "shan't" , "she" , "she'd" , "she'll" , "she's" , "should" , "shouldn't" , "so" , "some" , "such" , "than" , "that" , "that's" , "the" , "their" , "theirs" , "them" , "themselves" , "then" , "there" , "there's" , "these" , "they" , "they'd" , "they'll" , "they're" , "they've" , "this" , "those" , "through" , "to" , "too" , "under" , "until" , "up" , "very" , "was" , "wasn't" , "we" , "we'd" , "we'll" , "we're" , "we've" , "were" , "weren't" , "what" , "what's" , "when" , "when's" , "where" , "where's" , "which" , "while" , "who" , "who's" , "whom" , "why" , "why's" , "with" , "won't" , "would" , "wouldn't" , "you" , "you'd" , "you'll" , "you're" , "you've" , "your" , "yours" , "yourself" , "yourselves"]
\, 'ml#porter#stemmer(v:val)')
" Break a:text into a List of stemmed, lower-cased words, leaving out every
" word found in s:stops.
"
" a:text is a String, or a List whose items are joined with a single space.
function! string#tokenize(text)
  let raw = type(a:text) == type([]) ? join(a:text, ' ') : a:text

  " Lower-case, turn every non-word character into a space, squeeze runs of
  " blanks and strip both ends, so that a plain split on ' ' is enough.
  let cleaned = substitute(tolower(raw), '\W', ' ', 'g')
  let cleaned = substitute(cleaned, '\s\+', ' ', 'g')
  let cleaned = substitute(cleaned, '^\s*\(.\{-}\)\s*$', '\1', '')

  let stems = map(split(cleaned, ' '), 'ml#porter#stemmer(v:val)')

  " Filter out stops
  return filter(stems, 'index(s:stops, v:val) == -1')
endfunction
" Return a:str with leading and trailing whitespace removed; \_s in the
" pattern also covers newlines at either end.
function! string#trim(str)
  let core = '^\_s*\zs.\{-}\ze\_s*$'
  return matchstr(a:str, core)
endfunction
" Return a:obj unchanged when it is a Number or a String (type() 0 or 1);
" any other value is converted with string().
function! string#to_string(obj)
  " Work on a:obj directly: copying it into a lower-case local variable
  " raises E704 when a:obj is a Funcref.
  if type(a:obj) < 2
    return a:obj
  endif
  return string(a:obj)
endfunction
" Trim a:line and, when the result starts with '{' or '[', return the value
" obtained by evaluating it; otherwise return the trimmed text itself.
function! string#eval(line)
  let text = string#trim(a:line)
  if text[0] !~ '[{[]'
    return text
  endif
  " NOTE(review): eval() evaluates whatever expression follows the opening
  " bracket, so only pass text from a trusted source.
  return eval(text)
endfunction
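" string#range(number)   - number letters starting at 'A', e.g. 3 gives ['A', 'B', 'C']
" string#range(65, 90)   - ['A' .. 'Z'] (both arguments are character codes)
" string#range('a', 'f') - ['a' .. 'f']
" string#range('A', 6)   - ['A' .. 'F'] (a start character and a count)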
function! string#range(...)
  " Takes one or two arguments and returns a List of single characters.
  " Throws a 'vimple string#range: ...' message for any other argument count.
  if a:0 == 0
    throw 'vimple string#range: not enough arguments'
  elseif a:0 > 2
    throw 'vimple string#range: too many arguments'
  endif

  " One argument: that many characters counted up from "A".
  if a:0 == 1
    return map(range(a:1), 'nr2char(char2nr("A")+v:val)')
  endif

  " Two arguments: a Number is a character code, anything else is converted
  " with char2nr().
  let numeric_start = type(a:1) == type(0)
  let first = numeric_start ? a:1 : char2nr(a:1)

  if type(a:2) != type(0)
    let last = char2nr(a:2)
  elseif numeric_start
    let last = a:2
  else
    " Character start with a numeric second argument: treat it as a count.
    let last = (first + a:2) - 1
  endif

  return map(range(first, last), 'nr2char(v:val)')
endfunction
" returns a dict of {word : count}
function! string#words(text)
  " a:text is split on whitespace; each distinct word becomes a key whose
  " value is the number of times it occurred.
  let counts = {}
  for word in split(a:text)
    if has_key(counts, word)
      let counts[word] += 1
    else
      let counts[word] = 1
    endif
  endfor
  return counts
endfunction