fslist implementations based on cpio and GNU find/sort - fileset - git mirror of https://ccx.te2000.cz/bzr/fileset

commit 870d74697f374b90cf5ba3643cdb61c6cd572b3e
parent 4b5aa00a59b0bfae501233813f47e23d4087d15d
Author: Jan Pobrislo <ccx@webprojekty.cz>
Date:   Fri, 15 Dec 2017 01:49:11 +0100

fslist implementations based on cpio and GNU find/sort
Diffstat:
M README  | 5 ++++-
M bin/cpio2fs  | 246 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------
M bin/fslist  | 5 ++++-
A bin/fslist.pax  | 6 ++++++
A bin/fslist2  | 6 ++++++
A bin/fslist3  | 218 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M bin/fsvimdiff  | 9 +++++++++

7 files changed, 459 insertions(+), 36 deletions(-)
diff --git a/README b/README
@@ -50,7 +50,7 @@ h<flags><tab><destination>
 
 c<flags><tab><content>
 	create plain file with specified content (content may not contain tab, see C)
-	a leading newline is appended if the content does not end in newline,
+	a final newline is appended if the content does not end in newline,
 	unless following flags are given:
 	"n" - always append a newline
 	"N" - never append a newline
@@ -58,6 +58,9 @@ c<flags><tab><content>
 b<flags><tab><content>
 	base64, TODO description
 
+s<flags><tab><content>
+	checksum, TODO description
+
 Commands that take whole statement:
 
 P<tab><path>
diff --git a/bin/cpio2fs b/bin/cpio2fs
@@ -3,6 +3,7 @@
 setopt no_unset warn_create_global no_multibyte
 
 typeset -gA cpio_struct ftypes hardlinks
+typeset -ga cpio_oldbin_fields
 typeset -g delim filename ftype fmode
 
 ftypes=(  # convert hex type to mnemonic character
@@ -17,6 +18,19 @@ ftypes=(  # convert hex type to mnemonic character
 	1  p  # FIFO
 )
 
+cpio_oldbin_fields=(
+	c_magic 6
+	c_dev 6
+	c_ino 6
+	c_mode 6
+	c_uid 6
+	c_gid 6
+	c_nlink 6
+	c_rdev 6
+	c_mtime 11
+	c_namesize 6
+	c_filesize 11
+)
 
 ### Defaults for options passed by env {{{1
 
@@ -34,6 +48,8 @@ else
 	fi
 fi
 : ${print_x:=0}
+: ${max_bin_size:=}
+: ${max_newline_size:=1024}
 
 
 ### Generic helpers {{{1
@@ -46,48 +62,43 @@ die() {
 
 ### cpio header reader functions {{{1
 
-read_newc() {
-	local field
-	for field size in
-		c_magic 6 \
-		c_dev 6 \
-		c_ino 6 \
-		c_mode 6 \
-		c_uid 6 \
-		c_gid 6 \
-		c_nlink 6 \
-		c_rdev 6 \
-		c_mtime 11 \
-		c_namesize 6 \
-		c_filesize 11
-	do
-		read -k size || die "Short read of cpio archive header"
-		[[ $REPLY =~ '^[0-7]{'$size'}$' ]] \
-			|| die "Invalid octal header value: ${(qqq)REPLY}"
-		cpio_struct[$field]=$(( [##8] $REPLY ))
+read_oldc_header() {  # parse the octal odc header fields that follow the already-consumed magic
+	local field size bytes
+	for field size in ${cpio_oldbin_fields[3,-1]}; do  # skip c_magic: read_cpio_header already read it
+		IFS= read -k $size -u 0 bytes || die "Short read of cpio archive header"
+		[[ $bytes =~ '^[0-7]{'$size'}$' ]] \
+			|| die "Invalid octal header value: ${(qqq)bytes}"
+		cpio_struct[$field]=$(( 8#$bytes ))  # 8# parses octal input; [##8] would only set the output base
 	done
-	cpio_struct[rdev_major]=$(( $s[rdev] >> 8 ))
-	cpio_struct[rdev_minor]=$(( $s[rdev] & 255 ))
+	cpio_struct[rdev_major]=$(( $cpio_struct[c_rdev] >> 8 ))
+	cpio_struct[rdev_minor]=$(( $cpio_struct[c_rdev] & 255 ))
 }
 
-read_newc() {
-	local field
+read_newc_header() {
+	local field bytes
 	cpio_struct=( )
 	for field in c_ino c_mode c_uid c_gid c_nlink c_mtime c_filesize \
 		c_devmajor c_devminor c_rdevmajor c_rdevminor c_namesize c_check
 	do
-		read -k 8 || die "Short read of cpio archive header"
-		[[ $REPLY =~ '^[0-7a-fA-F]{8}$' ]] \
-			|| die "Invalid hexadecimal header value: ${(qqq)REPLY}"
-		cpio_struct[$field]=$(( [##16] $REPLY ))
+		IFS= read -k 8 -u 0 bytes || die "Short read of cpio archive header"
+		[[ $bytes =~ '^[0-9a-fA-F]{8}$' ]] \
+			|| die "Invalid hexadecimal header value: ${(qqq)bytes}"
+		cpio_struct[$field]=$(( 0x$bytes ))
 	done
 	cpio_struct[c_dev]=$cpio_struct[c_devmajor]:$cpio_struct[c_devminor]
+	cpio_struct[path_padding]=$[ 3-((3+2+$cpio_struct[c_namesize])%4 ) ]
+	cpio_struct[data_padding]=$[ 3-((3+$cpio_struct[c_filesize])%4) ]
 }
 
 read_cpio_header() {
-	read -k 6 || return $?
-	cpio_struct=( c_magic $REPLY )
-	case "$c_magic" in
+	local magic_bytes
+	IFS= read -k 6 -u 0 magic_bytes || return $?
+	cpio_struct=(
+		c_magic $magic_bytes
+		path_padding 0
+		data_padding 0
+	)
+	case "$cpio_struct[c_magic]" in
 		(070707)  # oldc (aka Portable ASCII Format)
 			read_oldc_header;;
 		(070701)  # newc (aka New ASCII Format)
@@ -95,7 +106,7 @@ read_cpio_header() {
 		(070702)  # crc (aka New CRC Format) is same in structure as newc
 			read_newc_header;;
 		(*)
-			die "Unknown cpio format number: ${(qqq)c_magic}";;
+			die "Unknown cpio format number: ${(qqq)cpio_struct[c_magic]}";;
 	esac
 }
 
@@ -103,7 +114,7 @@ read_cpio_header() {
 
 statement() {
 	# start on new line for multiline statements - more readable
-	if [[ -n delim && $1 == *$'\n'* ]]; then
+	if [[ -n "$delim" && $1 == *$'\n'* ]]; then
 		delim=$'\n'
 	fi
 	printf '%s%s' $delim ${1//$'\n'/$'\n\t'}
@@ -117,14 +128,181 @@ statement_end() {
 
 
 process_file() {
-	local filename ftype fmode t
-	filename="$(dd bs=$cpio_struct[c_namesize] count=1 | tr -d '\0')" \
+	local filename ftype fmode t padding link  # emits one entry for the member described by cpio_struct
+	IFS= read -k $[$cpio_struct[c_namesize]-1] -u 0 filename \
 		|| die "Could not read filename"
+	IFS= read -k $[1 + $cpio_struct[path_padding]] -u 0 padding \
+		|| die "Could not read filename padding"
+	if [[ $filename == "TRAILER!!!"* ]]; then  # trailer entry: nothing to print
+		return
+	fi
 	ftype=$(( [##16] $cpio_struct[c_mode] >> 12 ))
 	fmode=$(( [##8] $cpio_struct[c_mode] & 8#7777 ))
 	t=$ftypes[$ftype]
+	#printf "%s %s %s\n" >&2 $t $fmode "${(qqq)filename}"
+
+	# --- print stuff ---
+
+	(($compact)) || printf '\n'
+
+	if [[ $filename == . ]]; then
+		filename=/
+	elif [[ $filename == ./* ]]; then
+		filename=$filename[2,-1]  # drop the leading "."
+	fi
+	if [[ $filename == *$'\t'* || $filename == *$'\n'* ]]; then  # names with tab/newline need the P statement
+		statement $'P\t'$filename $'\t'
+	else
+		if (($compact)); then
+			statement /${filename%/}
+		else
+			statement /${filename%/} $'\n'
+		fi
+	fi
+
+	# Note: not supporting hardlinks
+
+	if [[ $t == [bu] ]]; then
+		statement $t${cpio_struct[rdev_major]-${cpio_struct[c_rdevmajor]-0}}:${cpio_struct[rdev_minor]-${cpio_struct[c_rdevminor]-0}}  # newc/crc headers only carry c_rdevmajor/c_rdevminor
+		(($cpio_struct[c_filesize])) && die "Unexpected data"
+	elif [[ $t == l ]]; then
+		(($cpio_struct[c_filesize])) || die "Missing data"
+		IFS= read -k $cpio_struct[c_filesize] -u 0 link \
+			|| die "Could not read symbolic link data"
+		statement $'l\t'$link $'\t'
+	elif [[ $t == f ]]; then
+		if (($print_c)); then
+			process_file_data $cpio_struct[c_filesize]
+		else
+			statement f
+			head -c$cpio_struct[c_filesize] >/dev/null \
+				|| die "Could not read data"
+		fi
+	else
+		statement $t
+		(($cpio_struct[c_filesize])) && die "Unexpected data"
+	fi
+
+	(($print_o)) && statement o$cpio_struct[c_uid]:$cpio_struct[c_gid]
+	(($print_m)) && statement m$fmode
+	statement_end
+
+	# --- read padding ---
+	if (($cpio_struct[data_padding])); then
+		IFS= read -k $cpio_struct[data_padding] -u 0 padding \
+			|| die "Could not read padding"
+		# dd status=none count=1 bs=$cpio_struct[data_padding] | xxd
+	fi
 }
 
+prhead() {  # usage: prhead <remaining> <string>...
+	local remaining
+	remaining=$1
+	shift
+	printf '%s' "$@" || exit $?  # write the strings given after the count
+	if (($remaining)); then
+		head -c$remaining || exit $?  # then copy up to that many bytes from stdin
+	fi
+}
+
+process_file_data() {
+	local size head remaining
+	local -a print_data
+	size=$1
+	if ! (($cpio_struct[c_filesize])); then
+		statement cN$'\t'  # empty file
+		return
+	fi
+
+	if (($size > 256)); then  # read up to 256 bytes to determine file type
+		IFS= read -k 256 -u 0 head || die "Could not read file data"
+		remaining=$[ $cpio_struct[c_filesize] - 256 ]
+	else
+		IFS= read -k $cpio_struct[c_filesize] -u 0 head \
+			|| die "Could not read file data"
+		remaining=0
+	fi
+		
+	# print binary representation?
+	if (($print_b | $print_x)) && \
+		[[ $(printf '%s' "$head" | file -bi -) != text/* ]]
+	then
+		if (($remaining)); then
+			if [[ -n $max_bin_size && $size -gt $max_bin_size ]]; then
+				statement s$'\tSHA512:'${"$(prhead $remaining "$head" | sha512sum)"%% *} \
+					|| die "Could not read data"
+			elif (($print_x)); then
+				if [[ -n "$delim" ]]; then
+					printf '\nX'
+				else
+					printf 'X'
+				fi
+				prhead $remaining "$head" | xxd | sed 's/^/\t/'
+				((${(j.|.)pipestatus})) && die "Could not read data"
+				statement_end
+			else
+				if [[ -n "$delim" ]]; then
+					printf '\nB'
+				else
+					printf 'B'
+				fi
+				prhead $remaining "$head" | base64 | sed 's/^/\t/'
+				((${(j.|.)pipestatus})) && die "Could not read data"
+				statement_end
+			fi
+		else
+			if [[ -n $max_bin_size && $size -gt $max_bin_size ]]; then
+				statement s$'\tSHA512:'${"$(printf '%s' "$head" | sha512sum)"%% *}
+			elif (($print_x)); then
+				statement X$'\t'"$(printf '%s' "$head" | xxd)" $'\n'
+			else
+				statement B$'\t'"$(printf '%s' "$head" | base64)" $'\n'
+			fi
+		fi
+		return
+	fi
+	# print text
+
+	# if file is longer than this, always use CN
+	if (($size > $max_newline_size)); then
+		if [[ -n "$delim" ]]; then
+			printf '\nCN\t'
+		else
+			printf 'CN\t'
+		fi
+		# Swap NL with @ so trailing newline is handled correctly
+		prhead $remaining "$head" \
+			| tr '\n@' '@\n' \
+			| sed 's/@/@\t/g' \
+			| tr '@\n' '\n@'
+		((${(j.|.)pipestatus})) && die "Could not read data"
+		statement_end
+		return
+	fi
+
+	# read and then print out, determining trailing newline flags
+	local content flags
+	if (($remaining)); then
+		IFS= read -r -d '' -u 0 -k $remaining content \
+			|| die "Could not read file data"
+	fi
+	content=$head$content
+	flags=''
+	if [[ $content == *$'\n' ]]; then
+		content=${content%$'\n'}
+		if [[ $content == *$'\n' ]]; then
+			# force appending newline
+			flags+=n
+		fi
+	else
+		flags+=N
+	fi
+	if ! (($compact)) || [[ $content == *$'\t'* || $content == *$'\n'* ]]; then
+		statement C$flags$'\t'$content $'\n'
+	else
+		statement c$flags$'\t'$content
+	fi
+}
 
 ### Mainloop {{{1
 while read_cpio_header; do
diff --git a/bin/fslist b/bin/fslist
@@ -48,6 +48,7 @@ else
 	fi
 fi
 : ${print_x:=0}
+: ${max_bin_size:=}
 
 fnames=( )
 for arg in "${@:-$ROOT}"; do
@@ -96,7 +97,9 @@ for fname in $fnames; do
 		if (($print_c)); then
 			if (($s[size])) && (($print_b + $print_x)) && \
 					[[ $(file -bi $fname) != text/* ]]; then
-				if (($print_x)); then
+				if [[ -n $max_bin_size && $s[size] -gt $max_bin_size ]]; then
+					statement s$'\tSHA512:'${"$(sha512sum < $fname)"%% *}
+				elif (($print_x)); then
 					statement X$'\t'"$(xxd $fname)" $'\n'
 				else
 					statement B$'\t'"$(base64 <$fname)" $'\n'
diff --git a/bin/fslist.pax b/bin/fslist.pax
@@ -0,0 +1,6 @@
+#!/bin/zsh
+if [[ -n $ROOT ]]; then
+	cd $ROOT || exit $?
+fi
+pax -w -x sv4cpio "$@" | cpio2fs
+exit $((${(j.|.)pipestatus}))
diff --git a/bin/fslist2 b/bin/fslist2
@@ -0,0 +1,6 @@
+#!/bin/zsh
+if [[ -n $ROOT ]]; then
+	cd $ROOT || exit $?
+fi
+find "$@" -print0 | sort -z | cpio -o -0 -H newc | cpio2fs
+exit $((${(j.|.)pipestatus}))
diff --git a/bin/fslist3 b/bin/fslist3
@@ -0,0 +1,218 @@
+#!/bin/zsh
+setopt no_unset warn_create_global no_multibyte
+zmodload zsh/stat
+
+typeset -gA ftypes hardlinks s
+typeset -g delim find fname ftype fmode
+
+ftypes=(  # convert hex type to mnemonic character
+	C  s  # socket
+	c  s
+	A  l  # symbolic link
+	a  l
+	8  f  # regular file
+	6  b  # block device
+	4  d  # directory
+	2  u  # character device
+	1  p  # FIFO
+)
+
+### Defaults for options passed by env {{{1
+
+# TODO: make into command-line arguments
+: ${compact:=1}
+: ${print_m:=1}
+: ${print_o:=1}
+: ${print_c:=1}
+: ${print_s:=1}
+if (($+commands[file])) && (($+commands[base64])); then
+	: ${print_b:=1}
+else
+	: ${print_b:=0}
+	if (($+commands[file])) && (($+commands[xxd])); then
+		: ${print_x:=1}
+	fi
+fi
+: ${print_x:=0}
+: ${max_bin_size:=}
+: ${max_newline_size:=1024}
+
+### Generic helpers {{{1
+
+die() {  # print each argument as a line on stderr, then exit with status 1
+	printf '%s\n' "$@" >&2  # stdout carries the FileSet listing, so diagnostics must not go there
+	exit 1
+}
+
+### FileSet writer functions {{{1
+
+statement() {  # usage: statement <text> [<delimiter to print before the next statement>]
+	# multiline statements go on a line of their own - more readable
+	[[ -z "$delim" || $1 != *$'\n'* ]] \
+		|| delim=$'\n'
+	# pending delimiter first, then the text with every embedded newline followed by a tab
+	printf '%s%s' "$delim" "${1//$'\n'/$'\n\t'}"
+	delim=${2:-$'\t'}
+}
+
+statement_end() {  # finish the current output line
+	printf '\n'
+	delim=''  # the next statement starts without a leading delimiter
+}
+
+
+process_file() {  # usage: process_file "<type> <mode> <uid>:<gid> <size>"; the path is taken from $fname
+	local -a find_info
+	local t fmode owner size filename
+	find_info=( $=1 )  # split the find record on whitespace
+	t=${find_info[1]/c/u}  # find reports character devices as "c"; here "u" is the device type and "c" means content
+	fmode=$find_info[2]
+	owner=$find_info[3]
+	size=$find_info[4]
+
+	# --- print stuff ---
+
+	(($compact)) || printf '\n'
+
+	filename="$fname"
+	if [[ $filename == . ]]; then
+		filename=/
+	elif [[ $filename == ./* ]]; then
+		filename=$filename[2,-1]
+	fi
+	filename=${filename#/}  # the leading slash is added back when printing
+
+	# printf "%s %s %s\n" >&2 $t $fmode "${(qqq)filename}"
+
+	if [[ $filename == *$'\t'* || $filename == *$'\n'* ]]; then
+		statement $'P\t'$filename $'\t'
+	else
+		if (($compact)); then
+			statement /${filename}
+		else
+			statement /${filename} $'\n'
+		fi
+	fi
+
+	# Note: not supporting hardlinks (yet)
+
+	if [[ $t == [bu] ]]; then
+		zstat -LH s $fname || die "stat failed on ${(qqq)fname}"
+		statement $t$(( $s[rdev] >> 8 )):$(( $s[rdev] & 255 ))
+	elif [[ $t == l ]]; then
+		zstat -LH s $fname || die "stat failed on ${(qqq)fname}"
+		statement $'l\t'$s[link] $'\t'
+	elif [[ $t == f ]]; then
+		if (($print_c)); then
+			process_file_data $size
+		else
+			statement f
+			if (($print_s)); then
+				statement s$'\tSHA512:'${"$(sha512sum <$fname)"%% *} \
+					|| die "Could not read ${(qqq)fname}"
+			fi
+		fi
+	else
+		statement $t
+	fi
+
+	(($print_o)) && statement o$owner
+	(($print_m)) && statement m$fmode
+	statement_end
+}
+
+process_file_data() {
+	local size
+	size=$1
+	if ! (($size)); then
+		statement cN$'\t'  # empty file
+		return
+	fi
+
+	# print binary representation?
+	if (($print_b | $print_x)) && \
+		[[ $(file -bi "$fname") != text/* ]]
+	then
+		if (($size > 256)); then
+			if [[ -n $max_bin_size && $size -gt $max_bin_size ]]; then
+				statement s$'\tSHA512:'${"$(sha512sum <$fname)"%% *} \
+					|| die "Could not read ${(qqq)fname}"
+			elif (($print_x)); then
+				if [[ -n "$delim" ]]; then
+					printf '\nX'
+				else
+					printf 'X'
+				fi
+				xxd <$fname | sed 's/^/\t/'
+				((${(j.|.)pipestatus})) && die "Could not read ${(qqq)fname}"
+				statement_end
+			else
+				if [[ -n "$delim" ]]; then
+					printf '\nB'
+				else
+					printf 'B'
+				fi
+				 base64 <$fname | sed 's/^/\t/'
+				 ((${(j.|.)pipestatus})) && die "Could not read ${(qqq)fname}"
+				statement_end
+			fi
+		else
+			if [[ -n $max_bin_size && $size -gt $max_bin_size ]]; then
+				statement s$'\tSHA512:'${"$(sha512sum <$fname)"%% *}
+			elif (($print_x)); then
+				statement X$'\t'"$(xxd <$fname)" $'\n'
+			else
+				statement B$'\t'"$(base64 <$fname)" $'\n'
+			fi
+		fi
+		return
+	fi
+	# print text
+
+	# if file is longer than this, always use CN
+	if (($size > $max_newline_size)); then
+		if [[ -n "$delim" ]]; then
+			printf '\nCN\t'
+		else
+			printf 'CN\t'
+		fi
+		# Swap NL with @ so trailing newline is handled correctly
+		tr <$fname '\n@' '@\n' \
+			| sed 's/@/@\t/g' \
+			| tr '@\n' '\n@'
+		((${(j.|.)pipestatus})) && die "Could not read ${(qqq)fname}"
+		statement_end
+		return
+	fi
+
+	# read and then print out, determining trailing newline flags
+	local content flags
+	content="$(<$fname)"
+	flags=''
+	if [[ $content == *$'\n' ]]; then
+		content=${content%$'\n'}
+		if [[ $content == *$'\n' ]]; then
+			# force appending newline
+			flags+=n
+		fi
+	else
+		flags+=N
+	fi
+	if ! (($compact)) || [[ $content == *$'\t'* || $content == *$'\n'* ]]; then
+		statement C$flags$'\t'$content $'\n'
+	else
+		statement c$flags$'\t'$content
+	fi
+}
+
+### Mainloop {{{1
+if (($+ROOT)) && [[ -n $ROOT ]]; then
+	cd $ROOT || exit $?
+fi
+
+find "$@" -printf '%y %m %U:%G %s\t%p\0' \
+	| sort -z -t $'\t' -k 2 \
+	| while IFS=$'\t' read -r -d $'\0' find fname
+do
+	process_file "$find" "$fname"
+done
diff --git a/bin/fsvimdiff b/bin/fsvimdiff
@@ -13,3 +13,12 @@ exec vim \
 	+"setlocal $opts" \
 	+"exe 'file '.\$ROOT2.'.fs'" \
 	+'norm ggzM'
+#exec vim \
+#	+'0r!ROOT="$ROOT1" fslist2 .' \
+#	+"setlocal $opts" \
+#	+"exe 'file '.\$ROOT1.'.fs'" \
+#	+'rightb vnew' \
+#	+'0r!ROOT="$ROOT2" fslist2 .' \
+#	+"setlocal $opts" \
+#	+"exe 'file '.\$ROOT2.'.fs'" \
+#	+'norm ggzM'

	fileset git mirror of https://ccx.te2000.cz/bzr/fileset
	git clone https://ccx.te2000.cz/git/fileset
	Log \| Files \| Refs \| README

M	README	\|	5	++++-
M	bin/cpio2fs	\|	246	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------
M	bin/fslist	\|	5	++++-
A	bin/fslist.pax	\|	6	++++++
A	bin/fslist2	\|	6	++++++
A	bin/fslist3	\|	218	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	bin/fsvimdiff	\|	9	+++++++++