bin/fslist.py

#!/usr/bin/env python2
# vim: fileencoding=utf-8 ft=python et sw=4 ts=4 sts=4 tw=79

from __future__ import (
    generators, division, absolute_import, with_statement, print_function
)
import sys
import os.path
import argparse
import subprocess
import mmap

from magic import mime_magic


# Command-line interface.  Boolean options come as --x / --no-x pairs that
# share one destination, so whichever is given last on the command line wins.
parser = argparse.ArgumentParser(
    description='Print one record per filesystem entry below each root: '
                'path, type or content, owner and mode.',
)
parser.add_argument(
    '--max-bin-size', type=int, default=4096,
    help='binary files larger than this many bytes are replaced by a hash; '
         'a negative value disables the limit (default: %(default)s)')
parser.add_argument(
    '--max-text-size', type=int, default=-1,
    help='text files larger than this many bytes are replaced by a hash; '
         'a negative value disables the limit (default: %(default)s)')
parser.add_argument(
    '--max-line-length', type=int, default=160,
    help='text files larger than this many bytes always use the multiline '
         'form (default: %(default)s)')

parser.add_argument('--compact', action='store_true', default=False,
                    help='no blank line between records, short text content '
                         'kept inline')
parser.add_argument('--no-compact', dest='compact', action='store_false',
                    help='blank line before each record, content on its own '
                         'line (default)')

parser.add_argument('--content', action='store_true', default=True,
                    help='include content or hash of regular files (default)')
parser.add_argument('--no-content', dest='content', action='store_false',
                    help='emit only the type marker for regular files')

parser.add_argument('--owner', action='store_true', default=True,
                    help='include numeric uid:gid (default)')
parser.add_argument('--no-owner', dest='owner', action='store_false',
                    help='omit uid:gid')

parser.add_argument('--mode', action='store_true', default=True,
                    help='include octal permission bits (default)')
parser.add_argument('--no-mode', dest='mode', action='store_false',
                    help='omit permission bits')

# Maps each --hash choice to the argv of the external tool that computes it;
# the tool reads the file on stdin.
hash_utils = {
    'md5': ('md5sum',),
    'sha1': ('sha1sum',),
    'sha224': ('sha224sum',),
    'sha256': ('sha256sum',),
    'sha384': ('sha384sum',),
    'sha512': ('sha512sum',),
    'blake2': ('b2sum',),
}
# sorted() keeps the choices in usage and error messages in a stable order
# regardless of dict ordering.
parser.add_argument('--hash', default='sha512',
                    choices=sorted(hash_utils),
                    help='digest used for files over the size limit '
                         '(default: %(default)s)')

# Maps each --bin-format choice to the argv of the external encoder used for
# binary content; the upper-cased key is also the statement tag.
bin_utils = {
    'b': ('base64',),
    'x': ('xxd',),
}
parser.add_argument('--bin-format', default='b',
                    choices=sorted(bin_utils),
                    help='encoder for binary content: b = base64, x = xxd '
                         '(default: %(default)s)')

parser.add_argument('root', nargs='*',
                    help='directories to list (default: current directory)')


# File type codes, compared against ``st_mode >> 12``.
FIFO = 0x1
CHAR = 0x2
DIR = 0x4
BLOCK = 0x6
FILE = 0x8
LINK = 0xA
SOCKET = 0xC

# Mnemonic character written to the listing for each file type code.
ftypes = {
    SOCKET: 's',  # socket
    LINK: 'l',    # symbolic link
    FILE: 'f',    # regular file
    BLOCK: 'b',   # block device
    DIR: 'd',     # directory
    CHAR: 'u',    # character device
    FIFO: 'p',    # FIFO
}


class Output(object):
    """Writes delimiter-separated statements to a wrapped file object.

    ``delim`` is the separator that will be written in front of the next
    statement; it is empty at the start of a record.
    """

    def __init__(self, fobj):
        self.fobj = fobj
        self.delim = ''

    def write(self, data):
        """Write ``data`` unchanged to the wrapped file object."""
        self.fobj.write(data)

    def close(self):
        """Close the wrapped file object."""
        self.fobj.close()

    def statement(self, text, delim=''):
        """Write one statement preceded by the pending delimiter.

        Every newline inside ``text`` is followed by a tab, so continuation
        lines are indented.  ``delim`` becomes the pending delimiter for the
        next statement; an empty value means a tab.
        """
        is_multiline = '\n' in text
        if is_multiline and self.delim:
            # a multiline statement reads better when it starts on a new line
            self.delim = '\n'
        indented = text.replace('\n', '\n\t')
        self.write(self.delim)
        self.write(indented)
        self.delim = delim if delim else '\t'

    def statement_end(self):
        """Terminate the current record and clear the pending delimiter."""
        self.write('\n')
        self.delim = ''

    def indent_copy(self, source, strip_last=True):
        """Copy ``source`` line by line, tab-indenting all but the first line.

        Lines are fetched with ``source.readline()`` until it returns an
        empty value.  The output always ends with a newline: one is appended
        when the last line lacks it.  When the last line does end with a
        newline and ``strip_last`` is false, an extra tab-only line is
        written.  The pending delimiter is cleared afterwards.
        """
        ended_with_newline = False
        while True:
            line = source.readline()
            if not line:
                break
            if ended_with_newline:
                self.write('\t')
            self.write(line)
            ended_with_newline = line[-1] == '\n'

        if ended_with_newline:
            if not strip_last:
                self.write('\t\n')
        else:
            self.write('\n')
        self.delim = ''


def hash_file(f, hash, out):
    """Write an ``s`` statement carrying the digest of ``f``.

    The tool registered for ``hash`` in ``hash_utils`` is run with ``f`` as
    its standard input; the first whitespace-separated field of its output
    is taken as the digest and written as ``s<TAB>NAME:digest`` with the
    hash name upper-cased.
    """
    label = hash.upper()
    tool_output = subprocess.check_output(hash_utils[hash], stdin=f)
    digest = tool_output.split(None, 1)[0]
    out.statement('s\t%s:%s' % (label, digest))


def process_content(args, f, size, out):
    """Write the content statement for one regular file.

    Files whose MIME type starts with ``text/`` are written inline; all other
    files are passed through the encoder selected by ``args.bin_format``.
    Files above the respective size limit are replaced by a hash statement.

    args: parsed options; reads compact, max_text_size, max_line_length,
        max_bin_size, hash and bin_format.
    f: open file object; its descriptor is handed to the MIME probe, to mmap
        and to child processes.
    size: file size in bytes as supplied by the caller.
    out: Output instance receiving the statements.

    Raises RuntimeError when the encoder exits with a non-zero status.
    """
    if not args.compact:
        # non-compact mode: the content statement starts on its own line
        out.delim = '\n'
    if mime_magic.descriptor(f.fileno()).startswith('text/'):
        # text file
        if args.max_text_size >= 0 and size > args.max_text_size:
            hash_file(f, args.hash, out)
        elif size > args.max_line_length:
            # Always use the multiline form for larger files
            mm = mmap.mmap(f.fileno(), 0, mmap.MAP_PRIVATE)
            try:
                # 'n' = content ends with a newline, 'N' = it does not
                flags = ('n' if mm[-1] == '\n' else 'N')
                out.statement('C%s\t' % flags, '')
                out.indent_copy(mm)
            finally:
                mm.close()
        else:
            content = f.read()
            flags = 'N'
            if content and content[-1] == '\n':
                # the trailing newline is recorded in the flag instead
                content = content[:-1]
                flags = 'n'
            if args.compact and not (
                    '\n' in content or '\t' in content
            ):
                # single-line form: content stays on the record line
                out.statement('c%s\t%s' % (flags, content))
            else:
                out.statement('C%s\t%s' % (flags, content), '\n')
    else:
        # binary file
        if args.max_bin_size >= 0 and size > args.max_bin_size:
            hash_file(f, args.hash, out)
        else:
            p = subprocess.Popen(
                bin_utils[args.bin_format],
                4096,
                stdin=f,
                stdout=subprocess.PIPE,
            )
            try:
                out.statement('%s\t' % args.bin_format.upper(), '')
                out.indent_copy(p.stdout)
            finally:
                # Always release the pipe and reap the child, even when
                # writing the output failed half-way through.
                p.stdout.close()
                status = p.wait()
            if status:
                raise RuntimeError(
                    "calling %r on %r failed with %r" % (
                        bin_utils[args.bin_format],
                        f.name,
                        p.returncode,
                    )
                )


def process_file(args, relname, rel_to, s, out):
    """Write the complete record for one filesystem entry.

    The record consists of the path, a type-specific statement (device
    numbers, link target, file content or just the type mnemonic) and,
    depending on the options, owner and mode statements.

    args: parsed options; reads compact, content, owner and mode.
    relname: path of the entry relative to ``rel_to``.
    rel_to: directory that ``relname`` is relative to.
    s: stat result of the entry (process_root passes the os.lstat() results
        produced by statwalk).
    out: Output instance receiving the record.
    """
    fname = os.path.join(rel_to, relname)
    ftype = s.st_mode >> 12
    # 0o... is accepted by Python 2.6+ and Python 3 alike
    mode = s.st_mode & 0o7777
    t = ftypes[ftype]

    if not args.compact:
        # blank line in front of every record
        out.write('\n')

    if '\t' in relname or '\n' in relname:
        # names containing the separators use the explicit 'P' form
        out.statement('P\t' + relname, '\t')
    else:
        out.statement('/' + relname, '\t')

    if ftype != DIR and s.st_nlink > 1:
        pass  # TODO: process hardlinks

    if ftype == BLOCK or ftype == CHAR:
        # os.major()/os.minor() decode the platform's device number layout;
        # plain ">> 8" / "& 255" breaks for minors >= 256 and large majors.
        out.statement('%s%d:%d' % (
            t, os.major(s.st_rdev), os.minor(s.st_rdev),
        ))
    elif ftype == LINK:
        out.statement('l\t' + os.readlink(fname), '\t')
    elif ftype == FILE:
        if args.content:
            with open(fname, 'rb') as f:
                process_content(args, f, s.st_size, out)
        else:
            out.statement('f')
    else:
        out.statement(t)

    if args.owner:
        out.statement('o%d:%d' % (s.st_uid, s.st_gid))

    if args.mode:
        out.statement('m%04o' % mode)

    out.statement_end()


def statwalk(root, top='', sort_func=sorted, exclude=None):
    """Recursively yield ``(relname, stat)`` for every entry below ``root``.

    root: directory the walk is rooted at.
    top: path below ``root`` to list, relative to ``root``; yielded names are
        relative to ``root`` as well.
    sort_func: called with the list of ``(name, stat)`` pairs of one
        directory; returns them in the order they should be visited.
    exclude: optional callable ``exclude(root, top, name, stat)``; a true
        result skips the entry and, for directories, everything below it.

    ``stat`` is the os.lstat() result, so symbolic links are reported as
    links and never followed.  A directory is yielded before its contents.
    ``sort_func`` and ``exclude`` apply at every level of the tree.
    """
    import stat  # function-local: nothing else in this file uses it

    names = os.listdir(os.path.join(root, top))
    entries = [
        (name, os.lstat(os.path.join(root, top, name))) for name in names
    ]

    for name, s in sort_func(entries):
        if exclude is not None and exclude(root, top, name, s):
            continue
        relname = os.path.join(top, name)
        yield (relname, s)
        if stat.S_ISDIR(s.st_mode):
            # We don't have readdir() and directory filedescriptors available
            # so we will keep just appending to the path.
            for x in statwalk(root, relname, sort_func, exclude):
                yield x


def process_root(args, root, out):
    """Write a record for every entry found by walking ``root``."""
    for entry in statwalk(root):
        relname, s = entry
        process_file(args, relname, root, s, out)


def main(args):
    """List every root in ``args.root`` to standard output.

    When no root is given, the current directory (``'.'``) is listed.
    """
    out = Output(sys.stdout)

    roots = args.root if args.root else ['.']
    for root in roots:
        process_root(args, root, out)


# Script entry point: parse the command line with the module-level parser
# and run the listing; nothing happens when the module is merely imported.
if __name__ == '__main__':
    args = parser.parse_args()
    main(args)