#!/bin/zsh
# Reads cpio archive and emits fileset file
# no_unset:           expanding an unset parameter is an error
# warn_create_global: warn when a function assigns a parameter that was not
#                     declared first
# no_multibyte:       treat strings as bytes (presumably so the `read -k N`
#                     calls below count bytes, not characters - confirm)
setopt no_unset warn_create_global no_multibyte

# cpio_struct: fields of the header most recently read by read_cpio_header
# ftypes:      file-type digit -> one-letter mnemonic (filled in below)
# hardlinks:   declared here; not referenced by the code shown in this file
typeset -gA cpio_struct ftypes hardlinks
# (name, width) pairs describing the old ASCII header (filled in below)
typeset -ga cpio_oldbin_fields
# delim is the separator pending before the next statement (see statement).
# filename/ftype/fmode are also declared local inside process_file.
typeset -g delim filename ftype fmode

# Maps the file-type digit of c_mode (mode >> 12 printed as one hex digit,
# see process_file) to the one-letter type mnemonic used in the output.
# The two types whose digit is a letter are listed in both letter cases.
ftypes=(
	C s   c s   # socket
	A l   a l   # symbolic link
	8 f         # regular file
	6 b         # block device
	4 d         # directory
	2 u         # character device
	1 p         # FIFO
)

# Layout of the 070707 (portable ASCII) header as (field name, width) pairs
# in on-disk order.  read_oldc_header walks this list and requires every
# field to consist of exactly <width> octal digits.
cpio_oldbin_fields=(
	c_magic 6      c_dev 6        c_ino 6        c_mode 6
	c_uid 6        c_gid 6        c_nlink 6      c_rdev 6
	c_mtime 11     c_namesize 6   c_filesize 11
)

### Defaults for options passed by env {{{1

# TODO: make into command-line arguments
# Every setting keeps a non-empty value inherited from the environment and
# only falls back to the default given here.
#   compact           omit the blank line between entries; keep simple
#                     statements on one line
#   print_m/o/c       emit mode / owner / file content statements
#   print_b, print_x  emit non-text content as base64 / as an xxd dump
#   max_bin_size      if set, larger non-text files are emitted as SHA512
#   max_newline_size  text files larger than this are streamed as CN
: ${compact:=1} ${print_m:=1} ${print_o:=1} ${print_c:=1}
# Binary output needs file(1) for type detection plus an encoder; base64 is
# preferred, xxd is the fallback.
if (( $+commands[file] && $+commands[base64] )); then
	: ${print_b:=1}
elif (( $+commands[file] && $+commands[xxd] )); then
	: ${print_b:=0} ${print_x:=1}
else
	: ${print_b:=0}
fi
: ${print_x:=0} ${max_bin_size:=} ${max_newline_size:=1024}


### Generic helpers {{{1

die() {
	# Print each argument on its own line to stderr and terminate the script
	# with status 1.  stdout carries the generated fileset, so diagnostics
	# must not be written there.
	printf '%s\n' "$@" >&2
	exit 1
}


### cpio header reader functions {{{1

read_oldc_header() {
	# Parse the remainder of a 070707 (portable ASCII) header from stdin into
	# cpio_struct.  Every field is a fixed-width octal number; values are
	# stored in decimal, as read_newc_header does.  Also derives rdev_major
	# and rdev_minor from c_rdev.  Dies on a short read or a non-octal field.
	local field size bytes
	for field size in $cpio_oldbin_fields; do
		# The 6 magic bytes were already consumed by read_cpio_header;
		# reading them again would shift every following field.
		[[ $field == c_magic ]] && continue
		IFS= read -k $size -u 0 bytes || die "Short read of cpio archive header"
		[[ $bytes =~ '^[0-7]{'$size'}$' ]] \
			|| die "Invalid octal header value: ${(qqq)bytes}"
		# 8#... interprets the digits as octal (a bare leading zero does not)
		cpio_struct[$field]=$(( 8#$bytes ))
	done
	cpio_struct[rdev_major]=$(( $cpio_struct[c_rdev] >> 8 ))
	cpio_struct[rdev_minor]=$(( $cpio_struct[c_rdev] & 255 ))
}

read_newc_header() {
	# Parse the remainder of a 070701/070702 (new ASCII / CRC) header from
	# stdin into cpio_struct: thirteen 8-digit hexadecimal fields, stored in
	# decimal.  Also sets
	#   c_dev                  "major:minor" of the containing device
	#   rdev_major/rdev_minor  same keys the oldc reader provides, which
	#                          process_file uses for device nodes
	#   path_padding           bytes after the name's NUL up to a multiple of 4
	#   data_padding           bytes after the file data up to a multiple of 4
	# Dies on a short read or a non-hexadecimal field.
	local field bytes
	cpio_struct=( )
	for field in c_ino c_mode c_uid c_gid c_nlink c_mtime c_filesize \
		c_devmajor c_devminor c_rdevmajor c_rdevminor c_namesize c_check
	do
		IFS= read -k 8 -u 0 bytes || die "Short read of cpio archive header"
		[[ $bytes =~ '^[0-9a-fA-F]{8}$' ]] \
			|| die "Invalid hexadecimal header value: ${(qqq)bytes}"
		cpio_struct[$field]=$(( 0x$bytes ))
	done
	cpio_struct[c_dev]=$cpio_struct[c_devmajor]:$cpio_struct[c_devminor]
	cpio_struct[rdev_major]=$cpio_struct[c_rdevmajor]
	cpio_struct[rdev_minor]=$cpio_struct[c_rdevminor]
	# The header is 110 bytes (110 % 4 == 2), hence the extra 2 for the name.
	cpio_struct[path_padding]=$(( 3 - ((3 + 2 + $cpio_struct[c_namesize]) % 4) ))
	cpio_struct[data_padding]=$(( 3 - ((3 + $cpio_struct[c_filesize]) % 4) ))
}

read_cpio_header() {
	# Read the next entry header from stdin into cpio_struct.
	# Returns the status of `read` (non-zero) when the 6 magic bytes cannot
	# be read; otherwise hands over to the reader for the detected format.
	# Dies on an unrecognised magic number.
	local magic
	IFS= read -k 6 -u 0 magic || return $?
	# paddings default to 0; the newc reader overrides them
	cpio_struct=( c_magic $magic  path_padding 0  data_padding 0 )
	case "$magic" in
		(070701|070702)
			# newc (New ASCII Format) and crc (New CRC Format) share one
			# header structure
			read_newc_header;;
		(070707)
			# oldc (aka Portable ASCII Format)
			read_oldc_header;;
		(*)
			die "Unknown cpio format number: ${(qqq)magic}";;
	esac
}

### FileSet writer functions {{{1

statement() {
	# Print statement text $1 preceded by the pending delimiter.
	#   $1  statement text; embedded newlines are followed by a TAB on output
	#   $2  delimiter to put before the NEXT statement (default: TAB)
	# Reads and updates the global `delim`.
	local text=$1 next=${2:-$'\t'}
	case $text in
		(*$'\n'*)
			# multiline statements start on a new line - more readable;
			# nothing to separate from if no statement is pending
			[[ -z "$delim" ]] || delim=$'\n'
			;;
	esac
	printf '%s%s' "$delim" "${text//$'\n'/$'\n\t'}"
	delim=$next
}

statement_end() {
	# Finish the current output line and clear the pending delimiter so the
	# next statement starts without a separator.
	printf '%s' $'\n'
	delim=
}


process_file() {
	# Consume the rest of one archive entry from stdin (name, data, padding)
	# and print its fileset line(s).  Expects cpio_struct to hold the header
	# just parsed by read_cpio_header.
	# Output: path statement, type/content statement, then owner (print_o)
	# and mode (print_m) statements.
	# Dies on short reads and on data that does not fit the entry type.
	local filename ftype fmode t padding link
	# c_namesize includes the terminating NUL; read the name without it ...
	IFS= read -k $[$cpio_struct[c_namesize]-1] -u 0 filename \
		|| die "Could not read filename"
	# ... then swallow the NUL together with the alignment padding
	IFS= read -k $[1 + $cpio_struct[path_padding]] -u 0 padding \
		|| die "Could not read filename"
	if [[ $filename == "TRAILER!!!"* ]]; then
		# End-of-archive marker.  Archives are usually NUL-padded to a block
		# boundary after it; drain the rest of the input so the main loop
		# sees EOF instead of parsing the padding as another header.
		cat >/dev/null
		return 0
	fi
	# top hex digit of the mode is the file type, low 12 bits the permissions
	ftype=$(( [##16] $cpio_struct[c_mode] >> 12 ))
	fmode=$(( [##8] $cpio_struct[c_mode] & 8#7777 ))
	t=$ftypes[$ftype]
	#printf "%s %s %s\n" >&2 $t $fmode "${(qqq)filename}"

	# --- print stuff ---

	(($compact)) || printf '\n'

	# NOTE(review): a "./x" name becomes "/x" here and the non-P branch below
	# prepends another "/", which looks like it yields "//x" - confirm intent.
	if [[ $filename == . ]]; then
		filename=/
	elif [[ $filename == ./* ]]; then
		filename=$filename[2,-1]
	fi
	if [[ $filename == *$'\t'* || $filename == *$'\n'* ]]; then
		# names containing the statement separators need the explicit P form
		statement $'P\t'$filename $'\t'
	else
		if (($compact)); then
			statement /${filename%/}
		else
			statement /${filename%/} $'\n'
		fi
	fi

	# Note: not supporting hardlinks

	if [[ $t == [bu] ]]; then
		statement $t$cpio_struct[rdev_major]:$cpio_struct[rdev_minor]
		(($cpio_struct[c_filesize])) && die "Unexpected data"
	elif [[ $t == l ]]; then
		# the link target is stored as the entry's data
		(($cpio_struct[c_filesize])) || die "Missing data"
		IFS= read -k $cpio_struct[c_filesize] -u 0 link \
			|| die "Could not read symbolic link data"
		statement $'l\t'$link $'\t'
	elif [[ $t == f ]]; then
		if (($print_c)); then
			process_file_data $cpio_struct[c_filesize]
		else
			statement f
			# content not wanted: skip over it
			head -c$cpio_struct[c_filesize] >/dev/null \
				|| die "Could not read data"
		fi
	else
		statement $t
		(($cpio_struct[c_filesize])) && die "Unexpected data"
	fi

	(($print_o)) && statement o$cpio_struct[c_uid]:$cpio_struct[c_gid]
	(($print_m)) && statement m$fmode
	statement_end

	# --- read padding ---
	if (($cpio_struct[data_padding])); then
		IFS= read -k $cpio_struct[data_padding] -u 0 padding \
			|| die "Could not read padding"
		# dd status=none count=1 bs=$cpio_struct[data_padding] | xxd
	fi
}

prhead() {
	# Write the already-buffered start of a file followed by the part that
	# is still unread.
	#   $1      number of bytes to copy from stdin after the buffered part
	#   $2...   buffered data, printed back to back
	# Exits the (sub)shell with the failing status if printf or head fails.
	local count=$1
	shift
	printf '%s' "$@" || exit $?
	(( count )) || return 0
	head -c "$count" || exit $?
}

# Read the data of a regular file from stdin and print its content statement.
#   $1  size of the data in bytes (the caller passes cpio_struct[c_filesize])
# Statement forms produced here:
#   cN<TAB>                 empty file
#   s<TAB>SHA512:<digest>   non-text data larger than max_bin_size
#   X / B                   non-text data as xxd dump / base64
#   CN                      text larger than max_newline_size, streamed
#   c / C (+ n or N flag)   remaining text, held in memory
# Dies when data cannot be read.
process_file_data() {
	local size head remaining
	local -a print_data
	size=$1
	if ! (($cpio_struct[c_filesize])); then
		statement cN$'\t'  # empty file
		return
	fi

	# `head` holds the first (up to) 256 bytes, `remaining` the number of
	# bytes still unread on stdin
	if (($size > 256)); then  # read up to 256 bytes to determine file type
		IFS= read -k 256 -u 0 head || die "Could not read file data"
		remaining=$[ $cpio_struct[c_filesize] - 256 ]
	else
		IFS= read -k $cpio_struct[c_filesize] -u 0 head \
			|| die "Could not read file data"
		remaining=0
	fi
		
	# print binary representation?
	# (only when binary output is enabled and file(1) does not report a
	# text/* MIME type for the sampled bytes)
	if (($print_b | $print_x)) && \
		[[ $(printf '%s' "$head" | file -bi -) != text/* ]]
	then
		if (($remaining)); then
			# more data than the sample: stream it through the encoder
			if [[ -n $max_bin_size && $size -gt $max_bin_size ]]; then
				# NOTE(review): the || below tests statement's exit status;
				# it does not look like it can see a failure inside $(...)
				statement s$'\tSHA512:'${"$(prhead $remaining "$head" | sha512sum)"%% *} \
					|| die "Could not read data"
			elif (($print_x)); then
				# written directly instead of via statement: start on a new
				# line if something was already printed for this entry
				if [[ -n "$delim" ]]; then
					printf '\nX'
				else
					printf 'X'
				fi
				prhead $remaining "$head" | xxd | sed 's/^/\t/'
				# die if any stage of the pipeline returned non-zero
				((${(j.|.)pipestatus})) && die "Could not read data"
				statement_end
			else
				if [[ -n "$delim" ]]; then
					printf '\nB'
				else
					printf 'B'
				fi
				prhead $remaining "$head" | base64 | sed 's/^/\t/'
				((${(j.|.)pipestatus})) && die "Could not read data"
				statement_end
			fi
		else
			# the whole file is already in $head
			if [[ -n $max_bin_size && $size -gt $max_bin_size ]]; then
				statement s$'\tSHA512:'${"$(printf '%s' "$head" | sha512sum)"%% *}
			elif (($print_x)); then
				statement X$'\t'"$(printf '%s' "$head" | xxd)" $'\n'
			else
				statement B$'\t'"$(printf '%s' "$head" | base64)" $'\n'
			fi
		fi
		return
	fi
	# print text

	# if file is longer than this, always use CN
	if (($size > $max_newline_size)); then
		if [[ -n "$delim" ]]; then
			printf '\nCN\t'
		else
			printf 'CN\t'
		fi
		# Swap NL with @ so trailing newline is handled correctly
		# (sed then adds a TAB after every original newline, and the second
		# tr swaps the two characters back)
		prhead $remaining "$head" \
			| tr '\n@' '@\n' \
			| sed 's/@/@\t/g' \
			| tr '@\n' '\n@'
		((${(j.|.)pipestatus})) && die "Could not read data"
		statement_end
		return
	fi

	# read and then print out, determining trailing newline flags
	local content flags
	if (($remaining)); then
		IFS= read -r -d '' -u 0 -k $remaining content \
			|| die "Could not read file data"
	fi
	content=$head$content
	# flags: N = data has no trailing newline; n = data still ends in a
	# newline after one trailing newline has been removed
	flags=''
	if [[ $content == *$'\n' ]]; then
		content=${content%$'\n'}
		if [[ $content == *$'\n' ]]; then
			# force appending newline
			flags+=n
		fi
	else
		flags+=N
	fi
	# C (followed by a newline delimiter) for multi-line or tab-containing
	# text and whenever compact is off; c keeps the statement on the line
	if ! (($compact)) || [[ $content == *$'\t'* || $content == *$'\n'* ]]; then
		statement C$flags$'\t'$content $'\n'
	else
		statement c$flags$'\t'$content
	fi
}

### Mainloop {{{1
# One iteration per archive entry; stops as soon as read_cpio_header cannot
# read another 6-byte magic number from stdin.
while read_cpio_header; do process_file; done