fileset

git mirror of https://ccx.te2000.cz/bzr/fileset
git clone https://ccx.te2000.cz/git/fileset
Log | Files | Refs | README

commit fda4d61014bf57f50c4f6f6f542cf24b9a19ae59
parent 870d74697f374b90cf5ba3643cdb61c6cd572b3e
Author: Jan Pobrislo <ccx@webprojekty.cz>
Date:   Wed, 21 Nov 2018 16:48:47 +0100

New, faster fslist.py.
Diffstat:
A bin/fslist.py | 244 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 244 insertions(+), 0 deletions(-)

diff --git a/bin/fslist.py b/bin/fslist.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python2 +# vim: fileencoding=utf-8 ft=python et sw=4 ts=4 sts=4 tw=79 + +from __future__ import ( + generators, division, absolute_import, with_statement, print_function +) +import sys +import os.path +import argparse +import subprocess +import mmap + +from magic import mime_magic + + +parser = argparse.ArgumentParser() +parser.add_argument('--max-bin-size', type=int, default=4096) +parser.add_argument('--max-text-size', type=int, default=-1) +parser.add_argument('--max-line-length', type=int, default=160) + +parser.add_argument('--compact', action='store_true', default=False) +parser.add_argument('--no-compact', dest='compact', action='store_false') + +parser.add_argument('--content', action='store_true', default=True) +parser.add_argument('--no-content', dest='content', action='store_false') + +parser.add_argument('--owner', action='store_true', default=True) +parser.add_argument('--no-owner', dest='owner', action='store_false') + +parser.add_argument('--mode', action='store_true', default=True) +parser.add_argument('--no-mode', dest='mode', action='store_false') + +hash_utils = { + 'md5': ('md5sum',), + 'sha1': ('sha1sum',), + 'sha224': ('sha224sum',), + 'sha256': ('sha256sum',), + 'sha384': ('sha384sum',), + 'sha512': ('sha512sum',), + 'blake2': ('b2sum',), +} +parser.add_argument('--hash', default='sha512', + choices=tuple(hash_utils.keys())) + +bin_utils = { + 'b': ('base64',), + 'x': ('xxd',), +} +parser.add_argument('--bin-format', default='b', + choices=tuple(bin_utils.keys())) + +parser.add_argument('root', nargs='*') + + +ftypes = { # convert numeric type to mnemonic character + 0xC: 's', # socket + 0xA: 'l', # symbolic link + 0x8: 'f', # regular file + 0x6: 'b', # block device + 0x4: 'd', # directory + 0x2: 'u', # character device + 0x1: 'p', # FIFO +} + +SOCKET = 0xC +LINK = 0xA +FILE = 0x8 +BLOCK = 0x6 +DIR = 0x4 +CHAR = 0x2 +FIFO = 0x1 + + +class Output(object): + def 
__init__(self, fobj): + self.fobj = fobj + self.delim = '' + + def write(self, data): + self.fobj.write(data) + + def close(self): + self.fobj.close() + + def statement(self, text, delim=''): + # start on new line for multiline statements - more readable + if self.delim and '\n' in text: + self.delim = '\n' + self.write(self.delim) + self.write(text.replace('\n', '\n\t')) + self.delim = delim or '\t' + + def statement_end(self): + self.write('\n') + self.delim = '' + + def indent_copy(self, source, strip_last=True): + line = source.readline() + had_endl = False + while line: + if had_endl: + self.write('\t') + had_endl = line[-1] == '\n' + self.write(line) + line = source.readline() + if not had_endl: + self.write('\n') + elif not strip_last: + self.write('\t\n') + self.delim = '' + + +def hash_file(f, hash, out): + out.statement('s\t%s:%s' % ( + hash.upper(), + subprocess.check_output(hash_utils[hash], stdin=f).split(None, 1)[0], + )) + + +def process_content(args, f, size, out): + if not args.compact: + out.delim = '\n' + if mime_magic.descriptor(f.fileno()).startswith('text/'): + # text file + if args.max_text_size >= 0 and size > args.max_text_size: + hash_file(f, args.hash, out) + elif size > args.max_line_length: + # Always use the multiline form for larger files + mm = mmap.mmap(f.fileno(), 0, mmap.MAP_PRIVATE) + flags = ('n' if mm[-1] == '\n' else 'N') + out.statement('C%s\t' % flags, '') + out.indent_copy(mm) + else: + content = f.read() + flags = 'N' + if content and content[-1] == '\n': + content = content[:-1] + flags = 'n' + if args.compact and not ( + '\n' in content or '\t' in content + ): + out.statement('c%s\t%s' % (flags, content)) + else: + out.statement('C%s\t%s' % (flags, content), '\n') + else: + # binary file + if args.max_bin_size >= 0 and size > args.max_bin_size: + hash_file(f, args.hash, out) + else: + p = subprocess.Popen( + bin_utils[args.bin_format], + 4096, + stdin=f, + stdout=subprocess.PIPE, + ) + out.statement('%s\t' % 
args.bin_format.upper(), '') + out.indent_copy(p.stdout) + if p.wait(): + raise RuntimeError( + "calling %r on %r failed with %r" % ( + bin_utils[args.bin_format], + f.name, + p.returncode, + ) + ) + + +def process_file(args, relname, rel_to, s, out): + fname = os.path.join(rel_to, relname) + # s = os.lstat(fname) + ftype = s.st_mode >> 12 + mode = s.st_mode & 07777 + t = ftypes[ftype] + + if not args.compact: + out.write('\n') + + if '\t' in relname or '\n' in relname: + out.statement('P\t' + relname, '\t') + else: + if args.compact: + out.statement('/' + relname) + else: + out.statement('/' + relname, '\t') + + if ftype != DIR and s.st_nlink > 1: + pass # TODO: process hardlinks + + if ftype == BLOCK or ftype == CHAR: + out.statement('%s%d:%d' % (t, s.st_rdev >> 8, s.st_rdev & 255)) + elif ftype == LINK: + out.statement('l\t' + os.readlink(fname), '\t') + elif ftype == FILE: + if args.content: + with open(fname, 'rb') as f: + process_content(args, f, s.st_size, out) + else: + out.statement('f') + else: + out.statement(t) + + if args.owner: + out.statement('o%d:%d' % (s.st_uid, s.st_gid)) + + if args.mode: + out.statement('m%04o' % mode,) + + out.statement_end() + + +def statwalk(root, top='', sort_func=sorted, exclude=None): + names = os.listdir(os.path.join(root, top)) + stats = [os.lstat(os.path.join(root, top, name)) for name in names] + + for (name, s) in sort_func(zip(names, stats)): + relname = os.path.join(top, name) + if exclude is not None and exclude(root, top, name, s): + continue + yield (relname, s) + if DIR == s.st_mode >> 12: + # We don't have readdir() and directory filedescriptors available + # so we will keep just appending to the path. 
+ for x in statwalk(root, relname): + yield x + + +def process_root(args, root, out): + for relname, s in statwalk(root): + process_file(args, relname, root, s, out) + + +def main(args): + out = Output(sys.stdout) + + if not args.root: + process_root(args, '.', out) + else: + for root in args.root: + process_root(args, root, out) + + +if __name__ == '__main__': + args = parser.parse_args() + main(args)