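# Use wget to mirror a site.
# $1 is the site URL
# - format is http(s)://www.sitename.com/ (the trailing slash is optional)
# Bash notes on stripping trailing slashes from $1:
# ${1%%+(/)} removes all slashes at the end (requires "shopt -s extglob")
# ${1%/} removes one slash at the end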
# ---------------------------------------------------------------------------
# Mirror a site with wget, then additionally mirror every URL listed in the
# site's sitemap (tried at <site>/sitemap.xml and <site>/sitemap/).
#
# Arguments: $1 - site URL, e.g. https://www.sitename.com/ (trailing slash
#                 optional)
# Outputs:   creates a directory named after the last path component of the
#            URL (the host name for a bare site URL) and writes
#            _MirrorSite.log inside it; progress is also shown on the terminal
# Exit code: 2 on bad usage, 1 if the log or temp directory cannot be created
#
# NOTE: "set -e" is deliberately not used. wget returns non-zero for partial
# failures (for example a single broken link), and the sitemap passes should
# still run in that case.
# ---------------------------------------------------------------------------

# Site URL with every trailing slash removed. A plain loop is used so that no
# extglob pattern (and therefore no "shopt -s extglob") is required.
MirrorDir=${1:-}
while [[ "$MirrorDir" == */ ]]; do
  MirrorDir=${MirrorDir%/}
done

# Every step below depends on the URL, so stop early if it is missing.
if [[ -z "$MirrorDir" ]]; then
  printf 'Usage: %s <site-url>\n' "${0##*/}" >&2
  exit 2
fi

Mirror_Start=$(date)

# Last path component of the URL; equivalent to "basename $1" once the
# trailing slashes are gone.
SiteName=${MirrorDir##*/}
MirrorLog="$SiteName/_MirrorSite.log"

# -p: re-running the mirror over an existing directory is not an error.
mkdir -p -- "$SiteName" || exit 1

# wget options, kept in an array so each option stays a separate word without
# relying on unquoted word splitting.
# With fix of URLs after download:
WgetParms=(-E -Kk --mirror -p -e robots=off)
# Without fix of URLs after download:
#WgetParms=(--mirror -e robots=off)

# Scratch space for the downloaded sitemap and the URL list extracted from it.
WorkDir=$(mktemp -d) || {
  printf 'cannot create temporary directory\n' >&2
  exit 1
}
trap 'rm -rf -- "$WorkDir"' EXIT

#######################################
# Download one candidate sitemap and mirror every <loc> URL found in it.
# Globals:   WorkDir, WgetParms, MirrorLog (all read)
# Arguments: $1 - full URL of the sitemap to try
# Outputs:   status line on stdout and appended to the log; wget output is
#            appended to the log as well
#######################################
mirror_from_sitemap() {
  local sitemap_url=$1
  local sitemap_xml="$WorkDir/sitemap.xml"
  local url_list="$WorkDir/sitemap-urls.txt"
  local status

  # Save the sitemap to a file so it can actually be parsed afterwards.
  if wget -q -O "$sitemap_xml" "$sitemap_url"; then
    grep -o '<loc>.*</loc>' "$sitemap_xml" | grep -o 'http[^<"]*' >"$url_list"
    if [[ -s "$url_list" ]]; then
      # -i reads the URLs from the file: no argument-length limit and no
      # xargs re-parsing of quotes or backslashes inside URLs.
      wget "${WgetParms[@]}" -i "$url_list" 2>&1 | tee -a "$MirrorLog"
    else
      echo "Sitemap=|$sitemap_url| contains no <loc> URLs" | tee -a "$MirrorLog"
    fi
    echo 'Sitemap='"|$sitemap_url|" | tee -a "$MirrorLog"
  else
    # $? still holds the exit status of the failed wget in the condition.
    status=$?
    echo 'Sitemap='"|$sitemap_url| FAILED! Error=$status" | tee -a "$MirrorLog"
  fi
}

# Mirror from the default home page. wget reports on stderr, so merge it into
# stdout to get it into the log; tee keeps it visible on the terminal too.
wget "${WgetParms[@]}" "$1" 2>&1 | tee "$MirrorLog"

# Mirror from the sitemap, if one exists at either conventional location.
echo '========================== Trying sitemap.xml =============================' | tee -a "$MirrorLog"
mirror_from_sitemap "$MirrorDir/sitemap.xml"
mirror_from_sitemap "$MirrorDir/sitemap/"

echo "$Mirror_Start <-- Start time" | tee -a "$MirrorLog"
echo "$(date) <---- End Time" | tee -a "$MirrorLog"