New, faster fslist.py. - fileset - git mirror of https://ccx.te2000.cz/bzr/fileset

commit fda4d61014bf57f50c4f6f6f542cf24b9a19ae59
parent 870d74697f374b90cf5ba3643cdb61c6cd572b3e
Author: Jan Pobrislo <ccx@webprojekty.cz>
Date:   Wed, 21 Nov 2018 16:48:47 +0100

New, faster fslist.py.
Diffstat:
A bin/fslist.py  | 244 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

1 file changed, 244 insertions(+), 0 deletions(-)
diff --git a/bin/fslist.py b/bin/fslist.py
@@ -0,0 +1,244 @@
+#!/usr/bin/env python2
+# vim: fileencoding=utf-8 ft=python et sw=4 ts=4 sts=4 tw=79
+
+from __future__ import (
+    generators, division, absolute_import, with_statement, print_function
+)
+import sys
+import os.path
+import argparse
+import subprocess
+import mmap
+
+from magic import mime_magic
+
+
+# Command line interface.
+parser = argparse.ArgumentParser()
+
+# Integer thresholds.
+for _option, _default in (
+        ('--max-bin-size', 4096),
+        ('--max-text-size', -1),
+        ('--max-line-length', 160),
+):
+    parser.add_argument(_option, type=int, default=_default)
+
+# Boolean switches; each one gets a matching --no-<name> that clears it.
+for _name, _default in (
+        ('compact', False),
+        ('content', True),
+        ('owner', True),
+        ('mode', True),
+):
+    parser.add_argument('--' + _name, action='store_true', default=_default)
+    parser.add_argument('--no-' + _name, dest=_name, action='store_false')
+
+# keep the loop variables out of the module namespace
+del _option, _name, _default
+
+# Command line (argv tuple) of the external program for each --hash choice.
+hash_utils = {
+    'md5': ('md5sum',),
+    'sha1': ('sha1sum',),
+    'sha224': ('sha224sum',),
+    'sha256': ('sha256sum',),
+    'sha384': ('sha384sum',),
+    'sha512': ('sha512sum',),
+    'blake2': ('b2sum',),
+}
+parser.add_argument('--hash', default='sha512', choices=tuple(hash_utils))
+
+# Command line (argv tuple) of the external encoder for each --bin-format.
+bin_utils = {
+    'b': ('base64',),
+    'x': ('xxd',),
+}
+parser.add_argument('--bin-format', default='b', choices=tuple(bin_utils))
+
+# Directories to list.
+parser.add_argument('root', nargs='*')
+
+
+# Numeric file type codes, compared against st_mode >> 12.
+SOCKET = 0xC
+LINK = 0xA
+FILE = 0x8
+BLOCK = 0x6
+DIR = 0x4
+CHAR = 0x2
+FIFO = 0x1
+
+# Mnemonic character for each numeric file type.
+ftypes = {
+    SOCKET: 's',  # socket
+    LINK: 'l',  # symbolic link
+    FILE: 'f',  # regular file
+    BLOCK: 'b',  # block device
+    DIR: 'd',  # directory
+    CHAR: 'u',  # character device
+    FIFO: 'p',  # FIFO
+}
+
+
+class Output(object):
+    """Writer for delimiter-separated statements on a file-like object.
+
+    ``delim`` holds the separator that will be written before the next
+    statement; it is the empty string at the start of a line.
+    """
+
+    def __init__(self, fobj):
+        self.delim = ''
+        self.fobj = fobj
+
+    def write(self, data):
+        """Pass ``data`` straight through to the underlying file object."""
+        self.fobj.write(data)
+
+    def close(self):
+        """Close the underlying file object."""
+        self.fobj.close()
+
+    def statement(self, text, delim=''):
+        """Write one statement, preceded by the pending delimiter.
+
+        Every newline inside ``text`` is followed by a tab in the output.
+        ``delim`` becomes the delimiter for the next statement; a tab is
+        used when it is empty.
+        """
+        multiline = '\n' in text
+        if multiline and self.delim:
+            # multiline statements start on a fresh line - more readable
+            self.delim = '\n'
+        self.write(self.delim)
+        self.write('\n\t'.join(text.split('\n')))
+        self.delim = delim if delim else '\t'
+
+    def statement_end(self):
+        """Terminate the current line and clear the pending delimiter."""
+        self.write('\n')
+        self.delim = ''
+
+    def indent_copy(self, source, strip_last=True):
+        """Copy ``source`` line by line, indenting continuation lines.
+
+        Lines are fetched with ``source.readline()`` until it returns an
+        empty value.  A tab is written before every line that follows a
+        newline-terminated one.  If the data does not end with a newline one
+        is appended; if it does and ``strip_last`` is false, a tab and a
+        newline are appended.  The pending delimiter is cleared afterwards.
+        """
+        after_newline = False
+        while True:
+            chunk = source.readline()
+            if not chunk:
+                break
+            if after_newline:
+                self.write('\t')
+            self.write(chunk)
+            after_newline = chunk[-1] == '\n'
+        if after_newline:
+            if not strip_last:
+                self.write('\t\n')
+        else:
+            self.write('\n')
+        self.delim = ''
+
+
+def hash_file(f, hash, out):
+    """Write an ``s`` statement carrying the checksum of the open file ``f``.
+
+    The external command registered for ``hash`` in ``hash_utils`` is run
+    with ``f`` as its standard input; the first whitespace-separated field
+    of its output is used as the digest.
+    """
+    label = hash.upper()
+    tool_output = subprocess.check_output(hash_utils[hash], stdin=f)
+    digest = tool_output.split(None, 1)[0]
+    out.statement('s\t%s:%s' % (label, digest))
+
+
+def process_content(args, f, size, out):
+    """Write the content statement for the regular file open as ``f``.
+
+    ``size`` is the file size in bytes as supplied by the caller.  Files
+    whose detected MIME type starts with ``text/`` are written verbatim:
+    short ones from a single read (``c``/``C`` statements), ones longer than
+    ``args.max_line_length`` streamed from a memory map.  The ``n``/``N``
+    flag records whether the content ends with a newline.  All other files
+    are piped through the encoder selected by ``args.bin_format``.  Files
+    larger than ``args.max_text_size`` / ``args.max_bin_size`` (a negative
+    limit disables the check) are represented by their hash instead.
+
+    Raises RuntimeError when the encoder exits with a non-zero status.
+    """
+    if not args.compact:
+        out.delim = '\n'
+    if mime_magic.descriptor(f.fileno()).startswith('text/'):
+        # text file
+        if args.max_text_size >= 0 and size > args.max_text_size:
+            hash_file(f, args.hash, out)
+        elif size > 0 and size > args.max_line_length:
+            # Always use the multiline form for larger files.  Empty files
+            # (reachable with a negative --max-line-length) are left to the
+            # branch below because mmap refuses to map an empty file.
+            mm = mmap.mmap(f.fileno(), 0, mmap.MAP_PRIVATE)
+            try:
+                flags = ('n' if mm[-1] == '\n' else 'N')
+                out.statement('C%s\t' % flags, '')
+                out.indent_copy(mm)
+            finally:
+                # release the mapping now instead of waiting for GC
+                mm.close()
+        else:
+            content = f.read()
+            flags = 'N'
+            if content and content[-1] == '\n':
+                content = content[:-1]
+                flags = 'n'
+            if args.compact and not (
+                    '\n' in content or '\t' in content
+            ):
+                out.statement('c%s\t%s' % (flags, content))
+            else:
+                out.statement('C%s\t%s' % (flags, content), '\n')
+    else:
+        # binary file
+        if args.max_bin_size >= 0 and size > args.max_bin_size:
+            hash_file(f, args.hash, out)
+        else:
+            p = subprocess.Popen(
+                bin_utils[args.bin_format],
+                4096,
+                stdin=f,
+                stdout=subprocess.PIPE,
+            )
+            try:
+                out.statement('%s\t' % args.bin_format.upper(), '')
+                out.indent_copy(p.stdout)
+            finally:
+                # Always close our end of the pipe and reap the child, even
+                # when writing the output failed.
+                p.stdout.close()
+                p.wait()
+            if p.returncode:
+                raise RuntimeError(
+                    "calling %r on %r failed with %r" % (
+                        bin_utils[args.bin_format],
+                        f.name,
+                        p.returncode,
+                    )
+                )
+
+
+def process_file(args, relname, rel_to, s, out):
+    """Write the statement line describing one filesystem entry.
+
+    ``relname`` is the entry's path relative to ``rel_to``; ``s`` is its
+    stat result.  The line consists of the path, a type-specific statement
+    (device numbers, link target, file content or the bare type character)
+    and, when enabled in ``args``, owner and mode.
+
+    Raises KeyError for a file type that is not listed in ``ftypes``.
+    """
+    fname = os.path.join(rel_to, relname)
+    # s = os.lstat(fname)
+    ftype = s.st_mode >> 12
+    # 0o... octal notation is accepted by Python 2.6+ and Python 3 alike
+    mode = s.st_mode & 0o7777
+    t = ftypes[ftype]
+
+    if not args.compact:
+        out.write('\n')
+
+    if '\t' in relname or '\n' in relname:
+        # paths containing the separators use the explicit P statement
+        out.statement('P\t' + relname, '\t')
+    else:
+        if args.compact:
+            out.statement('/' + relname)
+        else:
+            out.statement('/' + relname, '\t')
+
+    if ftype != DIR and s.st_nlink > 1:
+        pass  # TODO: process hardlinks
+
+    if ftype == BLOCK or ftype == CHAR:
+        # Decode with os.major()/os.minor(): plain shifting and masking is
+        # wrong for device numbers that do not fit the old layout of an
+        # 8-bit major and an 8-bit minor.
+        out.statement('%s%d:%d' % (
+            t, os.major(s.st_rdev), os.minor(s.st_rdev),
+        ))
+    elif ftype == LINK:
+        out.statement('l\t' + os.readlink(fname), '\t')
+    elif ftype == FILE:
+        if args.content:
+            with open(fname, 'rb') as f:
+                process_content(args, f, s.st_size, out)
+        else:
+            out.statement('f')
+    else:
+        out.statement(t)
+
+    if args.owner:
+        out.statement('o%d:%d' % (s.st_uid, s.st_gid))
+
+    if args.mode:
+        out.statement('m%04o' % mode)
+
+    out.statement_end()
+
+
+def statwalk(root, top='', sort_func=sorted, exclude=None):
+    """Recursively yield ``(relname, lstat_result)`` for entries below root.
+
+    ``top`` is the directory to list, relative to ``root``; yielded names
+    are relative to ``root`` as well.  ``sort_func`` receives the
+    ``(name, stat)`` pairs of one directory and returns them in the order
+    to visit.  ``exclude``, if given, is called as
+    ``exclude(root, top, name, stat)``; a true result skips the entry and,
+    for a directory, everything below it.  Both apply at every depth.
+    """
+    names = os.listdir(os.path.join(root, top))
+    stats = [os.lstat(os.path.join(root, top, name)) for name in names]
+
+    for (name, s) in sort_func(zip(names, stats)):
+        relname = os.path.join(top, name)
+        if exclude is not None and exclude(root, top, name, s):
+            continue
+        yield (relname, s)
+        if DIR == s.st_mode >> 12:
+            # We don't have readdir() and directory filedescriptors available
+            # so we will keep just appending to the path.
+            # Pass sort_func and exclude along so they also take effect in
+            # subdirectories, not only at the top level.
+            for x in statwalk(root, relname, sort_func, exclude):
+                yield x
+
+
+def process_root(args, root, out):
+    """Write a statement line for every entry found below ``root``."""
+    for entry in statwalk(root):
+        relname, s = entry
+        process_file(args, relname, root, s, out)
+
+
+def main(args):
+    """List every root in ``args.root`` on standard output.
+
+    The current directory is listed when no root was given.
+    """
+    out = Output(sys.stdout)
+    for root in (args.root or ['.']):
+        process_root(args, root, out)
+
+
+if __name__ == '__main__':
+    # Executed as a script: parse the command line and produce the listing.
+    args = parser.parse_args()
+    main(args)

	fileset git mirror of https://ccx.te2000.cz/bzr/fileset
	git clone https://ccx.te2000.cz/git/fileset
	Log \| Files \| Refs \| README