User Tools

Site Tools


packet:xrouter:manpages:parsing

This is an old revision of the document!


Script to parse Xrouter's MAN and HLP files

parse-pzt-manhlp.sh
#!/bin/bash
##################################
# by Jason M0MZF (not a programmer!)
# bash / awk / hammer / nail etc.
# License - MIT. Crack on people.
# 
# Script to parse Paula G8PZT's Xrouter MAN and HLP files into DokuWiki
# "some simple markup language" (SSML). The DokuWiki markup syntax is here:
# https://www.dokuwiki.org/wiki:syntax
#
# The intention is to parse all MAN / HLP files within the folders and
# write them with appropriate formatting to files which can then be
# pasted directly into the wiki.
#
# This could also be done with groff > HTML > pandoc > ssml but pandoc's
# output format for SSML doesn't sort out proper ====headers==== and I 
# don't know Lua. Yet. Maybe something like this with a custom output formatter:
# cat ${manpage} | groff -Thtml -P -l -mmandoc 2>/dev/null | pandoc -f html -t dokuwiki -o "$manpage".txt
# But when all you've got is awk, everything looks like a record / field... ;)
#
##################################
# Instructions (destructions?)
#
# - This script does not take any arguments
# - The only required configuration is to set the following path
#
BASEPATH="/home/jason/radio/packet/xrouter/Xrouter"
#
# This folder should contain the two directories "XRouter Help Files"
# and "XRouter Manual Files". A new directory called "docuwiki-<date>-<time>"
# will be created here, containing two directories (MANFILES and HLPFILES)
# that hold the concatenated and reformatted files. A manually-created
# index page exists in https://wiki.oarc.uk/packet:xrouter:manpages with
# top-level contents, and the pages linked therein have their contents
# copypasta'd from this script's output.
#
##################################
# Changelog
# 20250418 - Implemented MAN page parsing
# 20250419 - Implemented HLP page parsing
# 20250419 - Tidy up, more awk less bash, remove .MAN / .HLP from outputted headers
##################################
 
# Globals
# Timestamp of this run (YYYYMMDD-HHMMSS), used to name the output directory
DATE=$(date '+%Y%m%d-%H%M%S')
# Source trees, both located directly beneath BASEPATH
MANFILES="${BASEPATH}/XRouter Manual Files"
HLPFILES="${BASEPATH}/XRouter Help Files"
# Root of everything this run writes
OUTPUTDIR="${BASEPATH}/docuwiki-${DATE}"
 
# Colourise output
# Print $1 to stdout in bold red, framed by "----" on both sides.
# echo -e is used so the colour escapes (and any escapes in $1) are interpreted.
echoRed () {
	local bold_red='\e[1;31m' reset='\e[0m'
	echo -e "${bold_red}----$1----${reset}"
}
# Print $1 to stdout in bold green, framed by "----" on both sides.
# echo -e is used so the colour escapes (and any escapes in $1) are interpreted.
echoGreen () {
	local bold_green='\e[1;32m' reset='\e[0m'
	echo -e "${bold_green}----$1----${reset}"
}
 
# Refuse to run as root: print an error and exit the script with status 1
# when the invoking user id is 0; otherwise do nothing.
# Globals: UID (read)
# Outputs: error message on stderr (via echoRed) when run as root
checkRoot () {
	if [[ $UID -eq 0 ]]; then
		# echoRed is the red-output helper this script defines; there is no
		# echoError function, so calling it would only print "command not found".
		echoRed "This script must NOT be run as root!" >&2
		exit 1
	fi
}
 
# Use awk to:
#  strip out comment lines and remove any <CR> from <CR><LF> lines
#  turn the MAN page header (the non-comment, non-empty content of the first
#   two lines) into the start of a code block; it contains a revision date
#  find every subsequent MAN page header (a line starting with A-Z) and turn
#   it into a bold title, terminating the previous code block before the
#   title and starting a new code block after it, thus encapsulating the
#   subsection
#  keep the tags balanced: <code> is opened at most once within the first two
#   lines, and </code> is only written in front of a title when a block is open
# (the final closing </code> is appended by parseFiles below)
awkParseMan='
NR <= 2 {						# Page header area: the first two lines
	gsub("\r", "")					# remove all carriage return chars (also refreshes NF)
	if (/^;/ || NF == 0) { next }			# drop comment and empty lines
	if (!codeOpen) {				# open the code block only once
		printf "<code>"
		codeOpen = 1
	}
	print $0					# header text follows the tag on the same line
	next
}

/^[A-Z]/ {						# Subsection header: line begins with a capital letter
	closer = codeOpen ? "</code> " : ""		# end the previous code block, if there is one
	gsub("\r", "")					# remove all carriage return chars
	print closer "**" $0 "** <code>"		# bold title, then start a new code block
	codeOpen = 1
	next
}

/^;/ { next }						# skip comment lines

{							# every other line is body text
	gsub("\r", "")					# remove all carriage return chars
	print $0
}
'
# Use awk to:
#  remove any <CR> from <CR><LF> lines
#  write a start-of-code-block tag on a line of its own before any content
#  drop comment lines (always line 1, sometimes also 2 and 3) and empty lines
# (the closing </code> is appended by parseFiles below)
awkParseHlp='
NR == 1		{ print "<code>" }			# opening tag goes out before anything else
		{ gsub("\r", "") }			# strip carriage returns from every line (refreshes NF)
/^;/ || NF == 0	{ next }				# comments and empty lines are dropped
		{ print }				# whatever is left passes through untouched
'
 
# Use awk to turn a directory path (with trailing slash) into a section heading
awkSectionHeader='
BEGIN {
	FS = "/"					# split the path on slashes
	bars = "======="				# heading markup placed on both sides of the name
}
{							# input: /path/looks/like/this/Section Header Name/
	name = $(NF - 1)				# the trailing slash leaves the last field empty, so use the one before it
	print bars name bars
}
'
 
# Use awk to turn FILENAME.EXT into a heading showing FILENAME.
# Only the final extension is removed, so a name that itself contains dots
# (e.g. AX25.L2.HLP) keeps everything before the last dot.
awkFileHeader='
{							# input: a bare file name such as FILENAME.MAN
	header = "====="				# heading markup placed on both sides of the name
	title = $0
	sub(/\.[^.]*$/, "", title)			# strip the last dot and whatever follows it
	print header title header
}
'
 
#######################################
# Convert every file in every sub-folder of a source tree into one
# ".docuwiki" file per sub-folder.
# Globals:
#   OUTPUTDIR (read), awkSectionHeader, awkFileHeader, awkParseMan,
#   awkParseHlp (read), plus the variable NAMED by $1 (read indirectly)
# Arguments:
#   $1 - NAME of the variable holding the source tree path: MANFILES or
#        HLPFILES. It is also used as the output sub-directory name and
#        selects which awk program is applied.
# Outputs:
#   Appends to ${OUTPUTDIR}/$1/<section>.docuwiki; prints each section name
#   to stdout via echoRed.
# Returns:
#   1 if $1 is missing / names an empty variable or the output directory
#   cannot be created, otherwise the status of the last command run.
#######################################
parseFiles () {
	local kind=${1:-}
	local srcdir outdir folder section file title outfile

	# An empty source path would make the glob below expand to /*/ and walk
	# the filesystem root, so refuse early.
	if [[ -z "$kind" || -z "${!kind:-}" ]]; then
		printf 'parseFiles: "%s" does not name a non-empty source path variable\n' "$kind" >&2
		return 1
	fi
	srcdir=${!kind}
	outdir="${OUTPUTDIR}/${kind}"

	mkdir -p "$outdir" || return 1

	# Traverse folders, skipping files in base directory
	for folder in "$srcdir"/*/
	do
		# Without nullglob an unmatched pattern is returned literally
		[[ -d "$folder" ]] || continue

		# Section (folder) name = last path component, taken with parameter
		# expansion so spaces and glob characters survive untouched
		section=${folder%/}
		section=${section##*/}
		outfile="${outdir}/${section}.docuwiki"

		# Format the section name as a docuWiki header
		printf '%s\n' "$folder" | awk "$awkSectionHeader" >> "$outfile"
		# Spit some stuff out to the shell
		echoRed "$section"

		# Traverse through files
		for file in "$folder"*
		do
			# Skip sub-directories and the literal pattern of an empty folder
			[[ -f "$file" ]] || continue

			# File name = last path component
			title=${file##*/}
			# Format the file name as a docuwiki header
			printf '%s\n' "$title" | awk "$awkFileHeader" >> "$outfile"

			case "$kind" in
			# For MAN files the last line produced by awk is dropped: it is just
			# an EOF message and would break the closing </code> that follows.
			# sed '$d' deletes the final line and, unlike head -n -1, is POSIX.
			MANFILES)
				awk "$awkParseMan" "$file" | sed '$d' >> "$outfile"
				printf '</code>\n----\n' >> "$outfile"
			;;
			# For HLP files the last line is real content, so keep everything
			HLPFILES)
				awk "$awkParseHlp" "$file" >> "$outfile"
				printf '</code>\n' >> "$outfile"
			;;
			esac
		done
	done
}
 
# Entry point: refuse to run as root, then convert the MAN tree followed by
# the HLP tree. Each name below is both the variable holding the source path
# and the argument parseFiles expects.
checkRoot
for tree in MANFILES HLPFILES
do
	echoGreen "Parsing ${tree%FILES} files from ${!tree}"
	parseFiles "$tree"
done
packet/xrouter/manpages/parsing.1745132877.txt.gz · Last modified: 2025/04/20 07:07 by m0mzf