" Build a string-scanner object over {str}.
"
" {str} may be a String or a List of lines (joined with "\n").
" The returned Dictionary carries:
"   string  - the text being scanned
"   length  - len(string), in bytes
"   index   - current byte offset into string
"   matches - matchlist() result of the last successful scan()
" and the methods eos(), inject(), skip(), skip_until(), scan(), collect()
" and split() documented below.
function! string#scanner(str)
  let obj = {}
  if type(a:str) == type([])
    let obj.string = join(a:str, "\n")
  else
    let obj.string = a:str
  endif
  let obj.length = len(obj.string)
  let obj.index = 0

  " True once the index has reached (or passed) the end of the string.
  func obj.eos() dict
    return self.index >= self.length
  endfunc

  " Insert {str} at the current index (the index itself is not moved).
  " Returns the scanner so calls can be chained.
  func obj.inject(str)
    let self.string = strpart(self.string, 0, self.index)
          \ . a:str . strpart(self.string, self.index)
    let self.length = len(self.string)
    return self
  endfunc

  " Advance past {pat} if it matches exactly at the current index.
  " Returns the new index, or -1 (index untouched) when there is no match.
  func obj.skip(pat) dict
    let pos = matchend(self.string, '\_^' . a:pat, self.index)
    if pos != -1
      let self.index = pos
    endif
    return pos
  endfunc

  " Advance up to (but not over) the next occurrence of {pat}.
  " Returns the new index, or -1 (index untouched) when {pat} is not found.
  func obj.skip_until(pat) dict
    let pos = matchend(self.string, '\_.\{-}\ze' . a:pat, self.index)
    if pos != -1
      let self.index = pos
    endif
    return pos
  endfunc

  " Match {pat} at the current index; on success consume it, store the
  " matchlist() result in self.matches and return the matched text.
  " Returns "" when {pat} does not match at the index.
  func obj.scan(pat) dict
    " Use \_^ here to anchor the match at the start of the index.
    " Otherwise it finds the first match after index.
    let m = matchlist(self.string, '\_^' . a:pat, self.index)
    if ! empty(m)
      let self.index += len(m[0])
      let self.matches = m
      return m[0]
    endif
    return ""
  endfunc

  " Return a List of every match of {pat} from the current index onwards.
  func obj.collect(pat) dict
    let matches = []
    while ! self.eos()
      if self.skip_until(a:pat) == -1
        break
      endif
      let before = self.index
      let found = self.scan(a:pat)
      if self.index == before
        " Nothing was consumed (zero-width match): step over one byte so
        " the loop is guaranteed to terminate instead of spinning forever.
        let self.index += 1
      else
        call add(matches, found)
      endif
    endwhile
    return matches
  endfunc

  " Split the remaining text on the pattern {sep}.
  " With a true optional second argument the separators are kept in the
  " returned List as their own elements.
  func obj.split(sep, ...) dict
    let keepsep = 0
    if a:0
      let keepsep = a:1
    endif
    let pieces = []
    let old_index = 0
    while ! self.eos()
      if self.skip_until(a:sep) == -1
        " No further separator: the rest of the string is the last piece.
        call add(pieces, strpart(self.string, old_index))
        break
      endif
      let the_piece = strpart(self.string, old_index, (self.index - old_index))
      call add(pieces, the_piece)
      let the_sep = self.scan(a:sep)
      if keepsep && (the_sep != '')
        call add(pieces, the_sep)
      endif
      if old_index == self.index
        " Neither piece nor separator consumed anything: emit one byte and
        " step over it so the loop always makes progress.
        call add(pieces, strpart(self.string, old_index, 1))
        let self.index += 1
      endif
      let old_index = self.index
    endwhile
    return pieces
  endfunc

  return obj
endfunction

" A list of tokens with navigation methods & element access.
" Each token is a List: [type, value, line].
function! string#tokens()
  let obj = {}
  let obj.tokens = []
  let obj.index = 0
  let obj.cur_tok = []
  let obj.next_tok = []

  " Append the ['_end_', '_end_', line] sentinel and prime next_tok with the
  " first token. The sentinel reuses the line of the last real token, or 1
  " when no token was added. Returns self.
  func obj.finalise()
    let last_line = empty(self.tokens) ? 1 : self.tokens[-1][-1]
    call add(self.tokens, ['_end_', '_end_', last_line])
    let self.num_tokens = len(self.tokens)
    let self.next_tok = self.tokens[0]
    return self
  endfunc

  " Return the next token and advance. Once the '_end_' sentinel has been
  " reached it is returned again on every further call.
  func obj.next()
    let self.cur_tok = self.next_tok
    " Stop at the last valid index; going one further would make the
    " lookup below fail with a list-index-out-of-range error.
    if self.index < self.num_tokens - 1
      let self.index += 1
    endif
    let self.next_tok = self.tokens[self.index]
    return self.cur_tok
  endfunc

  " Append the token [type, value, line].
  func obj.add(type, value, line)
    call add(self.tokens, [a:type, a:value, a:line])
  endfunc

  return obj
endfunction

" Lex {string} (a String or a List of lines) into a string#tokens() object.
"
" Returns the lexer Dictionary after lexing; its 'tokens' entry holds the
" finalised token list and 'ss' the string#scanner() that was used.
function! string#lexer(string)
  let obj = {}
  let obj.tokens = string#tokens()
  let obj.string = ''
  let obj.line_continuation_pattern = '\n\s*\\'
  " Token types in the order their patterns are tried.
  let obj.pattern_order = [
        \ 'whitespace', 'name'
        \, 'float_number', 'hex_number', 'oct_number', 'int_number'
        \, 'tq_string', 'dq_string', 'sq_string'
        \, 'operator', 'comment', 'unknown'
        \]
  " Token types that get line continuations joined in their value and whose
  " newlines are added to the running line number.
  let obj.newline_patterns = [
        \ 'whitespace'
        \, 'tq_string', 'dq_string', 'sq_string'
        \, 'comment', 'unknown'
        \]
  let obj.patterns = {
        \ 'whitespace' : ['\s\+', '\n\%(\s*\\\s*\)\?']
        \, 'name' : ['[ablgstw]:\w*', '[_a-zA-Z]\+']
        \, 'float_number' : ['\d\+\.\d\+\%([eE][+-]\?\d\+\)\?']
        \, 'hex_number' : ['0x\x\+']
        \, 'oct_number' : ['0\o\+']
        \, 'int_number' : ['\d\+']
        \, 'tq_string' : ['"""\_.\{-}"""']
        \, 'dq_string' : ['"\%(\\\.\|[^\n]\)*"']
        \, 'sq_string' : ['''\%(''''\|\_.\)\{-}''']
        \, 'operator' : ['[\\\[\](){}<>:,./\\?=+!@#$%^&*`~|-]\+']
        \, 'comment' : ['"[^\n]*\n']
        \, 'unknown' : ['\S\+']
        \}

  " (Re)initialise the lexer with {str} and lex it. Returns self.
  func obj.new(str)
    let self.tokens = string#tokens()
    if type(a:str) == type([])
      let self.string = join(a:str, "\n")
    else
      let self.string = a:str
    endif
    " A trailing newline is appended before the text is scanned.
    let self.ss = string#scanner(self.string . "\n")
    call self.lex()
    let self.tokens = self.tokens.finalise()
    return self
  endfunc

  " Remove every line continuation (newline, optional blanks, backslash).
  func obj.join_line_continuations(string)
    return substitute(a:string, self.line_continuation_pattern, '', 'g')
  endfunc

  " Consume the scanner, adding one token per match. At each position the
  " first pattern (in pattern_order) that matches wins.
  func obj.lex()
    let lines = 1
    while self.ss.index < self.ss.length
      let matched = 0
      for type in self.pattern_order
        for pat in self.patterns[type]
          let value = self.ss.scan(pat)
          if value != ''
            let matched = 1
            " Keep the raw text: newlines are counted on it after the
            " continuations have been stripped from the stored value.
            let t_value = value
            if index(self.newline_patterns, type) != -1
              let value = self.join_line_continuations(value)
            endif
            call self.tokens.add(type, value, lines)
            if index(self.newline_patterns, type) != -1
              let lines += len(substitute(t_value, '[^\n]', '', 'g'))
            endif
            break
          endif
        endfor
        if matched
          break
        endif
      endfor
    endwhile
  endfunc

  return obj.new(a:string)
endfunction

" Stop words, passed through ml#porter#stemmer() so they can be compared
" with the stemmed words produced by string#tokenize().
let s:stops = map(
      \ [ "a" , "about" , "above" , "after" , "again" , "against" , "all"
      \ , "am" , "an" , "and" , "any" , "are" , "aren't" , "as" , "at"
      \ , "be" , "because" , "been" , "before" , "being" , "below"
      \ , "between" , "both" , "but" , "by" , "can't" , "cannot" , "could"
      \ , "couldn't" , "did" , "didn't" , "do" , "does" , "doesn't"
      \ , "doing" , "don't" , "down" , "during" , "each" , "few" , "for"
      \ , "from" , "further" , "had" , "hadn't" , "has" , "hasn't" , "have"
      \ , "haven't" , "having" , "he" , "he'd" , "he'll" , "he's" , "her"
      \ , "here" , "here's" , "hers" , "herself" , "him" , "himself" , "his"
      \ , "how" , "how's" , "i" , "i'd" , "i'll" , "i'm" , "i've" , "if"
      \ , "in" , "into" , "is" , "isn't" , "it" , "it's" , "its" , "itself"
      \ , "let's" , "me" , "more" , "most" , "mustn't" , "my" , "myself"
      \ , "no" , "nor" , "not" , "of" , "off" , "on" , "once" , "only"
      \ , "or" , "other" , "ought" , "our" , "ours" , "ourselves" , "out"
      \ , "over" , "own" , "same" , "shan't" , "she" , "she'd" , "she'll"
      \ , "she's" , "should" , "shouldn't" , "so" , "some" , "such" , "than"
      \ , "that" , "that's" , "the" , "their" , "theirs" , "them"
      \ , "themselves" , "then" , "there" , "there's" , "these" , "they"
      \ , "they'd" , "they'll" , "they're" , "they've" , "this" , "those"
      \ , "through" , "to" , "too" , "under" , "until" , "up" , "very"
      \ , "was" , "wasn't" , "we" , "we'd" , "we'll" , "we're" , "we've"
      \ , "were" , "weren't" , "what" , "what's" , "when" , "when's"
      \ , "where" , "where's" , "which" , "while" , "who" , "who's" , "whom"
      \ , "why" , "why's" , "with" , "won't" , "would" , "wouldn't" , "you"
      \ , "you'd" , "you'll" , "you're" , "you've" , "your" , "yours"
      \ , "yourself" , "yourselves"]
      \, 'ml#porter#stemmer(v:val)')

" Turn {text} (a String or a List joined with spaces) into a List of
" stemmed words: lower-cased, non-word characters replaced by spaces,
" whitespace squeezed and trimmed, each word passed through
" ml#porter#stemmer(), and words found in s:stops removed.
function! string#tokenize(text)
  let t = (type(a:text) == type([]) ? join(a:text, ' ') : a:text)
  let text = map(
        \ split(
        \   substitute(
        \     substitute(
        \       substitute(tolower(t)
        \         , '\W', ' ', 'g')
        \       , '\s\+', ' ', 'g')
        \     , '^\s*\(.\{-}\)\s*$', '\1', '')
        \   , ' ')
        \, 'ml#porter#stemmer(v:val)')
  " Filter out stops
  let out = []
  for word in text
    if index(s:stops, word) == -1
      call add(out, word)
    endif
  endfor
  return out
endfunction

" Return {str} without leading and trailing whitespace (newlines included).
function! string#trim(str)
  return matchstr(a:str, '^\_s*\zs.\{-}\ze\_s*$')
endfunction

" Return {obj} unchanged when it is a Number or String (type() < 2),
" otherwise its string() representation.
function! string#to_string(obj)
  let obj = a:obj
  if type(obj) < 2
    return obj
  else
    return string(obj)
  endif
endfunction

" Trim {line}; when the result starts with '{' or '[' return it evaluated
" as a Vim expression, otherwise return the trimmed text.
" NOTE(review): eval() executes whatever expression the text contains, so
" only pass text from a trusted source.
function! string#eval(line)
  let line = string#trim(a:line)
  if line[0] =~ '[{[]'
    return eval(line)
  else
    return line
  endif
endfunction

" range(number)   - {number} letters starting at 'A'
" range(65, 90)   - ['A' .. 'Z']   (character codes, inclusive)
" range('a', 'f') - ['a' .. 'f']
" range('A', 6)   - ['A' .. 'F']   (start character and a count)
" Throws when called with no arguments or with more than two.
function! string#range(...)
  if ! a:0
    throw 'vimple string#range: not enough arguments'
  endif
  if a:0 > 2
    throw 'vimple string#range: too many arguments'
  endif
  if a:0 == 1
    return map(range(a:1), 'nr2char(char2nr("A")+v:val)')
  else
    if type(a:1) == type(0)
      let start = a:1
    else
      let start = char2nr(a:1)
    endif
    if type(a:2) == type(0)
      if type(a:1) == type(0)
        " Both numbers: the second is the final character code.
        let end = a:2
      else
        " Character + number: the number is a count of characters.
        let end = (start + a:2) - 1
      endif
    else
      let end = char2nr(a:2)
    endif
    return map(range(start, end), 'nr2char(v:val)')
  endif
endfunction

" returns a dict of {word : count}
" {text} is a String, or a List of lines which is joined with spaces first.
function! string#words(text)
  let text = (type(a:text) == type([]) ? join(a:text, ' ') : a:text)
  let words = {}
  for w in split(text)
    let words[w] = get(words, w, 0) + 1
  endfor
  return words
endfunction