Added website archiver script using httrack over Docker. Still not working correctly.

This commit is contained in:
David Ball 2024-06-03 16:42:29 +00:00
parent 0a0e5e4b52
commit 981ff4f04e
4 changed files with 78 additions and 0 deletions

View File

@ -0,0 +1,14 @@
# Image that provides the HTTrack website copier (plus git) for archiving sites
# into the /archived volume.
FROM ubuntu:noble

# Build-time only: keeps apt from prompting; not baked into the runtime env.
ARG DEBIAN_FRONTEND="noninteractive"
# NOTE(review): declared but not referenced by any instruction in this file.
ARG GIT_REPO_URL=""
# Time zone, overridable with --build-arg TZ=...; the original substitution
# referenced TZ without declaring it, so only the default could ever apply.
ARG TZ="America/New_York"
ENV TZ="${TZ}"

# Install the tools as root, before dropping privileges. tzdata is needed for
# the TZ value to resolve to a real zone; ca-certificates for HTTPS mirrors.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        git \
        httrack \
        tzdata \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /archived
# WORKDIR creates the directory as root; hand it to the unprivileged user so
# mirrors can be written there. Must happen before VOLUME is declared.
RUN chown ubuntu:ubuntu /archived
VOLUME /archived

# The ubuntu:noble base image ships an unprivileged "ubuntu" account.
USER ubuntu

# Exec form, without the stray trailing space that made the binary name invalid.
# Supply URLs/options by overriding the command at `docker run`.
CMD ["httrack"]

View File

@ -0,0 +1,6 @@
#!/bin/sh
# Archive russellcountyida.org (with and without the www. prefix) by invoking
# ./sync-website-archive.sh with its configuration passed in the environment.
#
# Every assignment line must end in a backslash so that all three variables
# belong to the same simple command as the script invocation; without it the
# assignment is a separate, unexported shell variable the child never sees.
REPO_DIR="../../nm3clol-archived-russellcountyida.org" \
URLS="https://russellcountyida.org/ https://www.russellcountyida.org/" \
MIRROR_DIR="mirror" \
./sync-website-archive.sh

View File

@ -0,0 +1,6 @@
#!/bin/sh
# Archive www.russellcountyva.us by invoking ./sync-website-archive.sh with its
# configuration passed in the environment. In addition to the site root, one
# AgendaCenter search URL per calendar year is used as a start URL.

SITE="https://www.russellcountyva.us"

# Build the space-separated start-URL list, newest year first, site root last.
# '&' needs no escaping inside double quotes; a backslash there would be kept
# literally and end up inside the URL.
URLS=""
for YEAR in 2024 2023 2022 2021 2020 2019 2018 2017 2016; do
    URLS="${URLS}${SITE}/AgendaCenter/Search/?term=&CIDs=all&startDate=01/01/${YEAR}&endDate=12/31/${YEAR}&dateRange=&dateSelector= "
done
URLS="${URLS}${SITE}/"

# The assignments prefix the command so they are exported to the child script.
REPO_DIR="../../nm3clol-archived-russellcountyva.us" \
URLS="$URLS" \
MIRROR_DIR="mirror" \
./sync-website-archive.sh

View File

@ -0,0 +1,52 @@
#!/bin/sh
# Mirror one or more websites with HTTrack into a git working copy, then
# commit, tag, and push the snapshot.
#
# Configuration is read from the environment:
#   REPO_DIR    path of the git working copy (required; created if missing)
#   REPO_URL    remote to clone from when REPO_DIR is empty
#   URLS        whitespace-separated list of start URLs (required)
#   MIRROR_DIR  directory inside the repository that receives the mirror
#               (default: mirror)
#
# The tag name is derived from the launch timestamp on the first line of
# HTTrack's hts-log.txt, formatted as YYYY-MM-DD-HH-MM-SS.

MIRROR_DIR="${MIRROR_DIR:-mirror}"
# Remember where we started; this must be an assignment, not just an echo.
OPEN_PWD="$(pwd)"

echo "REPO_DIR=$REPO_DIR"
echo "REPO_URL=$REPO_URL"
echo "URLS=$URLS"
echo "OPEN_PWD=$OPEN_PWD"

if [ -z "$REPO_DIR" ] || [ -z "$URLS" ]; then
    echo "REPO_DIR and URLS must be set." >&2
    exit 1
fi

# Abort if we cannot enter the repository: everything below (notably rm -rf)
# must never run in the caller's directory.
mkdir -p "$REPO_DIR" || exit 1
cd "$REPO_DIR" || exit 1

# We are already inside REPO_DIR, so inspect "." rather than re-resolving the
# (possibly relative) REPO_DIR path from here.
if [ -n "$(ls -A .)" ]; then
    git pull
    # echo httrack --update
else
    if [ -z "$REPO_URL" ]; then
        echo "REPO_DIR is empty and REPO_URL is not set; nothing to clone." >&2
        exit 1
    fi
    git clone "$REPO_URL" . || exit 1
fi

# Start every run from a clean mirror directory.
rm -rf "./$MIRROR_DIR"

# $URLS is intentionally unquoted so it splits into one argument per URL.
# Globbing is disabled around the call so the '?' in URLs and the '*' in the
# filter patterns are passed to httrack verbatim.
set -f
httrack -wqiC2%Ps0u1%s%uN0%Ip3DaK0H1%kf2A125000%f#f \
    -%F "<!-- Mirrored from %s%s by HTTrack Website Copier/3.x [XR&CO'2014], %s -->" \
    -%l "en, *" \
    $URLS \
    -O1 "./$MIRROR_DIR" \
    +*.png +*.gif +*.jpg +*.jpeg +*.css +*.js +*.pdf +*.doc +*.docx \
    +*.xls +*.xlsx +*.csv +*.odt +*.ppt +*.pptx +*.epub +*.webp +*.webm \
    +*.mkv +*.mpg +*.mpeg +*.mov +*.wav +*.mp3 -mime:application/*
set +f

LOG_FILE="./$MIRROR_DIR/hts-log.txt"
if [ ! -f "$LOG_FILE" ]; then
    echo "HTTrack did not produce $LOG_FILE; nothing to commit." >&2
    cd "$OPEN_PWD"
    exit 1
fi

git add "./$MIRROR_DIR/"

# Commit message: the log's first line up to " at ", e.g.
# "Synced with <version> launched on <Day>, <DD> <Mon> <YYYY> <HH:MM:SS>."
# No literal quote characters are embedded in the message.
MESSAGE="$(head -n 1 "$LOG_FILE" | sed -E 's/(.*) at .*/Synced with \1./')"

# Only tag and push when a commit was actually created.
if ! git commit -m "$MESSAGE"; then
    echo "Nothing was committed; skipping tag and push." >&2
    cd "$OPEN_PWD"
    exit 1
fi

# Pull all six timestamp fields out of the message in one pass; the output is
# empty when the expected "on <Day>, <DD> <Mon> <YYYY> <HH>:<MM>:<SS>" text is
# not present.
DATE_FIELDS="$(printf '%s\n' "$MESSAGE" | sed -nE 's/.* on [[:alpha:]]+, ([[:digit:]]+) ([[:alpha:]]+) ([[:digit:]]+) ([[:digit:]]+):([[:digit:]]+):([[:digit:]]+).*/\3 \2 \1 \4 \5 \6/p')"
read -r DATE_YEAR MONTH_NAME DATE_DAY DATE_HOUR DATE_MINUTE DATE_SECOND <<EOF
$DATE_FIELDS
EOF

case "$MONTH_NAME" in
    Jan) DATE_MONTH="01" ;;
    Feb) DATE_MONTH="02" ;;
    Mar) DATE_MONTH="03" ;;
    Apr) DATE_MONTH="04" ;;
    May) DATE_MONTH="05" ;;
    Jun) DATE_MONTH="06" ;;
    Jul) DATE_MONTH="07" ;;
    Aug) DATE_MONTH="08" ;;
    Sep) DATE_MONTH="09" ;;
    Oct) DATE_MONTH="10" ;;
    Nov) DATE_MONTH="11" ;;
    Dec) DATE_MONTH="12" ;;
    *)   DATE_MONTH="" ;;
esac

if [ -n "$DATE_MONTH" ] && [ -n "$DATE_SECOND" ]; then
    VERSION_TAG_DATE="$DATE_YEAR-$DATE_MONTH-$DATE_DAY-$DATE_HOUR-$DATE_MINUTE-$DATE_SECOND"
    MOST_RECENT_COMMIT="$(git rev-parse HEAD)"
    git tag "$VERSION_TAG_DATE" "$MOST_RECENT_COMMIT"
else
    # A malformed tag name is worse than no tag; the commit is still pushed.
    echo "Could not parse a launch date from: $MESSAGE; skipping tag." >&2
fi

git push --all
git push --tags
cd "$OPEN_PWD"