@Jellby: Right... It's not exactly a one-liner anymore, and the script has all the clarity of white noise. I couldn't be bothered to double-check @font-face in the CSS, and a tag with a font nested inside another tag with a font is not handled correctly (the outer tag's list will include the characters from the inner tag). Typical usage:
./script.sh <xhtml-file> <css-file>
If the css is inline, it's just
./script.sh <xhtml-file>
It tries to detect the extra fonts in the CSS file and which classes/ids use them, and then lists which characters are used with each font.
Code:
#!/bin/bash
#
# List, for every font named in a stylesheet, the distinct characters that
# an XHTML document renders with that font.
#
# Usage: script.sh <xhtml-file> [<css-file>]
#   With a single argument the CSS is taken from the <style> element(s) of
#   the XHTML file itself.
#
# Requires: GNU grep and sed, an awk that accepts a regex RS (gawk, mawk),
#           recode and xmllint.
#
# Known limitations (unchanged from the original approach):
#   - only rules whose font-family value is quoted are recognised;
#   - only the first whitespace-delimited word of a selector is used;
#   - text of a nested element with its own font is also counted for the
#     enclosing element.

set -u

if (( $# < 1 )); then
  printf 'Usage: %s <xhtml-file> [<css-file>]\n' "${0##*/}" >&2
  exit 2
fi

file=$1
# Last positional argument: the stylesheet, or the XHTML file itself when
# only one argument was given.
css=${!#}

for input in "$file" "$css"; do
  if [[ ! -r "$input" ]]; then
    printf '%s: cannot read %s\n' "${0##*/}" "$input" >&2
    exit 1
  fi
done

for tool in awk recode xmllint; do
  if ! command -v "$tool" >/dev/null; then
    printf '%s: required tool not found: %s\n' "${0##*/}" "$tool" >&2
    exit 1
  fi
done

# Default namespace URI of the document: the text following `xmlns="`
# (7 characters), first occurrence only so the value stays on one line.
xmlns=$(grep -o "xmlns=.[^\"']\+" -- "$file" | head -n 1 | cut -c8-180)

# Private scratch copy of the document, removed on every exit path.
tmp=$(mktemp) || exit 1
trap 'rm -f -- "$tmp"' EXIT

# Remove comments: splitting on either comment delimiter makes the records
# alternate between "outside a comment" (odd) and "inside a comment" (even).
awk 'BEGIN { RS = "<!--|-->" } NR % 2 == 1' "$file" > "$tmp"

# Replace named HTML entities by their UTF-8 characters, because xmllint
# does not know them without a DTD.  The five entities predefined by XML
# are left alone: xmllint understands them, and expanding e.g. &lt; would
# corrupt the markup.
grep -o '&[a-zA-Z0-9]\+;' "$tmp" | sort -u |
  while IFS= read -r entity; do
    case "$entity" in
      '&amp;' | '&lt;' | '&gt;' | '&quot;' | '&apos;') continue ;;
    esac
    replacement=$(printf '%s' "$entity" | recode HTML..UTF-8) || continue
    # Escape the characters that are special in a sed replacement.
    replacement=$(printf '%s' "$replacement" | sed 's|[/&\\]|\\&|g')
    sed -i "s/${entity}/${replacement}/g" "$tmp"
  done

# Extract the CSS (only the <style> section when it is inline), reduce each
# rule that sets a quoted font-family to "<selector> <font name>", and turn
# the selector into an XPath step:
#   .cls    -> a:*[@class="cls"]
#   tag.cls -> a:tag[@class="cls"]
#   #id     -> *[@id="id"]
if grep -q '</style' -- "$css"; then
  sed -n '/<style/,/<\/style/p' -- "$css"
else
  cat -- "$css"
fi |
  tr "\n" " " |
  sed 's/[>}]/&\n/g' |
  grep -v "@font-face" |
  sed -n "/font-family: *[\"']/{s/^ *\(.*\) *{.*font-family: *[\"']\([^\"']\+\).*/\1 \2/;p}" |
  sed -e '/^\./s/^/*/' \
      -e 's#\(.*\)\.\([^ \]\+\)#a:\1[@class="\2"]#' \
      -e 's/.*#\([^ ]*\)/*[@id="\1"]/' |
  while read -r line; do
    selector=${line%% *}   # XPath step built above
    font=${line#* }        # everything after the first space: the font name
    echo
    echo "${font}: "
    # Ask the xmllint shell for every text node below the selected elements,
    # then strip the shell's prompt and separator lines, split the text into
    # one character per line, de-duplicate, drop whitespace and join the
    # remaining characters back into a single line.
    printf 'setns a=%s\ncat //%s//text()\n' "$xmlns" "$selector" |
      xmllint --shell "$tmp" |
      sed -e 1d -e '/^\(\/ >\| -\{7\}$\)/d' -e 's/./&\n/g' |
      sort -u |
      sed '/[ \t]/d' |
      sed -n 'H;${x;s/\n//g;p}'
  done