From 981ff4f04e1c74d41a5675bff59e4ce5b12a7f3d Mon Sep 17 00:00:00 2001
From: David Ball
Date: Mon, 3 Jun 2024 16:42:29 +0000
Subject: [PATCH] Added website archiver script using httrack over Docker.
 Still not working correctly.

---
Review notes (this section is ignored when the patch is applied):

 * httrack.Dockerfile: "RUN sh -c" had no command and the CMD name had a
   trailing space; httrack is now installed and TZ is a real build argument.
 * sync-russellcountyida.org.sh: REPO_DIR lacked its trailing backslash, so
   it was never placed in the helper's environment.
 * sync-russellcountyva.us.sh: "\&" inside double quotes keeps the backslash;
   the URLs are now built in a loop with a plain "&".
 * sync-website-archive.sh: OPEN_PWD is now assigned, the emptiness check
   looks at ".", MIRROR_DIR is honoured, scan rules are quoted, and a tag is
   only created when the timestamp was parsed.
 * The "index" blob ids below still name the original file contents.

 website-archiver/httrack.Dockerfile           | 29 ++++++++
 website-archiver/sync-russellcountyida.org.sh | 12 ++++
 website-archiver/sync-russellcountyva.us.sh   | 22 ++++++
 website-archiver/sync-website-archive.sh      | 91 ++++++++++++++++++++++
 4 files changed, 154 insertions(+)
 create mode 100644 website-archiver/httrack.Dockerfile
 create mode 100755 website-archiver/sync-russellcountyida.org.sh
 create mode 100755 website-archiver/sync-russellcountyva.us.sh
 create mode 100755 website-archiver/sync-website-archive.sh

diff --git a/website-archiver/httrack.Dockerfile b/website-archiver/httrack.Dockerfile
new file mode 100644
index 0000000..ca53a9b
--- /dev/null
+++ b/website-archiver/httrack.Dockerfile
@@ -0,0 +1,29 @@
+# Image providing the httrack website copier used by the website archiver.
+FROM ubuntu:noble
+
+# Build-time only, so apt stays non-interactive without leaking into runtime.
+ARG DEBIAN_FRONTEND="noninteractive"
+# Not referenced by any instruction yet; kept so --build-arg callers still work.
+ARG GIT_REPO_URL=""
+# Override with --build-arg TZ=...; the default matches the original value.
+ARG TZ="America/New_York"
+ENV TZ="${TZ}"
+
+# Install as root, before USER drops privileges, and clean up in the same layer.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        ca-certificates \
+        httrack \
+        tzdata \
+    && rm -rf /var/lib/apt/lists/*
+
+# Hand the archive directory to the unprivileged user before declaring the
+# volume; changes made to a volume path after VOLUME are discarded.
+WORKDIR /archived
+RUN chown ubuntu:ubuntu /archived
+VOLUME /archived
+
+USER ubuntu
+
+# Exec form takes the bare program name; pass httrack arguments at docker run.
+CMD ["httrack"]
diff --git a/website-archiver/sync-russellcountyida.org.sh b/website-archiver/sync-russellcountyida.org.sh
new file mode 100755
index 0000000..45a3f9f
--- /dev/null
+++ b/website-archiver/sync-russellcountyida.org.sh
@@ -0,0 +1,12 @@
+#!/bin/sh
+# Archive russellcountyida.org into its sibling git working copy.
+
+# REPO_DIR and the helper are relative paths, so run from this script's directory.
+cd "$(dirname "$0")" || exit 1
+
+# Every assignment must end in a backslash so that all three are passed in the
+# environment of the helper script rather than set as local shell variables.
+REPO_DIR="../../nm3clol-archived-russellcountyida.org" \
+URLS="https://russellcountyida.org/ https://www.russellcountyida.org/" \
+MIRROR_DIR="mirror" \
+./sync-website-archive.sh
diff --git a/website-archiver/sync-russellcountyva.us.sh b/website-archiver/sync-russellcountyva.us.sh
new file mode 100755
index 0000000..7fcb088
--- /dev/null
+++ b/website-archiver/sync-russellcountyva.us.sh
@@ -0,0 +1,22 @@
+#!/bin/sh
+# Archive russellcountyva.us, including the Agenda Center search results for
+# each year from 2024 back to 2016, into its sibling git working copy.
+
+# REPO_DIR and the helper are relative paths, so run from this script's directory.
+cd "$(dirname "$0")" || exit 1
+
+SITE="https://www.russellcountyva.us"
+
+# One search URL per year, newest first. "&" needs no backslash inside double
+# quotes; an escaped "\&" would reach httrack with the backslash still in it.
+URLS=""
+for YEAR in 2024 2023 2022 2021 2020 2019 2018 2017 2016; do
+    URLS="${URLS}${SITE}/AgendaCenter/Search/?term=&CIDs=all&startDate=01/01/${YEAR}&endDate=12/31/${YEAR}&dateRange=&dateSelector= "
+done
+# Finish with the site root, as the last start URL.
+URLS="${URLS}${SITE}/"
+
+REPO_DIR="../../nm3clol-archived-russellcountyva.us" \
+URLS="$URLS" \
+MIRROR_DIR="mirror" \
+./sync-website-archive.sh
diff --git a/website-archiver/sync-website-archive.sh b/website-archiver/sync-website-archive.sh
new file mode 100755
index 0000000..42fd364
--- /dev/null
+++ b/website-archiver/sync-website-archive.sh
@@ -0,0 +1,91 @@
+#!/bin/sh
+#
+# Mirror websites with httrack into a git working copy, then commit, tag and
+# push the result.
+#
+# Environment:
+#   REPO_DIR    Working copy that receives the mirror (required).
+#   REPO_URL    Remote to clone when REPO_DIR is empty (required only then).
+#   URLS        Whitespace-separated start URLs (required).
+#   MIRROR_DIR  Mirror directory inside REPO_DIR (default: mirror).
+
+# Stop early when unset: with an empty REPO_DIR the rm and git steps below
+# would run in whatever directory the caller happened to be in.
+: "${REPO_DIR:?REPO_DIR must be set}"
+: "${URLS:?URLS must be set}"
+MIRROR_PATH="./${MIRROR_DIR:-mirror}"
+
+echo "REPO_DIR=$REPO_DIR"
+echo "REPO_URL=${REPO_URL:-}"
+echo "URLS=$URLS"
+
+# Record the starting directory so the final cd can return to it.
+OPEN_PWD="$(pwd)"
+echo "OPEN_PWD=$OPEN_PWD"
+cd "$REPO_DIR" || exit 1
+
+# REPO_DIR may be relative, so list "." now that it is the current directory.
+if [ "$(ls -A .)" ]; then
+    git pull
+    # echo httrack --update
+else
+    : "${REPO_URL:?REPO_URL must be set to clone into an empty REPO_DIR}"
+    git clone "$REPO_URL" . || exit 1
+fi
+rm -rf "$MIRROR_PATH"
+# $URLS stays unquoted so every URL becomes its own argument; the scan rules
+# are quoted so the shell never expands them as globs.
+# shellcheck disable=SC2086
+httrack -wqiC2%Ps0u1%s%uN0%Ip3DaK0H1%kf2A125000%f#f -%F "" -%l "en, *" $URLS -O1 "$MIRROR_PATH" \
+    '+*.png' '+*.gif' '+*.jpg' '+*.jpeg' '+*.css' '+*.js' \
+    '+*.pdf' '+*.doc' '+*.docx' '+*.xls' '+*.xlsx' '+*.csv' '+*.odt' \
+    '+*.ppt' '+*.pptx' '+*.epub' '+*.webp' '+*.webm' '+*.mkv' '+*.mpg' \
+    '+*.mpeg' '+*.mov' '+*.wav' '+*.mp3' '-mime:application/*'
+
+# The commit message and tag are derived from the log, so give up without it.
+if [ ! -f "$MIRROR_PATH/hts-log.txt" ]; then
+    echo "Missing $MIRROR_PATH/hts-log.txt; nothing to commit." >&2
+    exit 1
+fi
+
+git add "$MIRROR_PATH/"
+# Keep the part of the first log line that precedes " at ".
+MESSAGE="$(head -n 1 "$MIRROR_PATH/hts-log.txt" | sed -E 's/(.*) at .*/Synced with \1./')"
+
+git commit -m "$MESSAGE" || exit 1
+
+# Extract "year month day hour minute second" from the text that follows
+# " on " (weekday, day month year h:m:s) in one pass; -n with p leaves STAMP
+# empty when the message does not match.
+STAMP="$(printf '%s\n' "$MESSAGE" | sed -nE 's/.* on [[:alpha:]]+, ([[:digit:]]+) ([[:alpha:]]+) ([[:digit:]]+) ([[:digit:]]+):([[:digit:]]+):([[:digit:]]+).*/\3 \2 \1 \4 \5 \6/p')"
+# shellcheck disable=SC2086
+set -- $STAMP
+DATE_MONTH=""
+case "${2:-}" in
+    Jan) DATE_MONTH="01" ;;
+    Feb) DATE_MONTH="02" ;;
+    Mar) DATE_MONTH="03" ;;
+    Apr) DATE_MONTH="04" ;;
+    May) DATE_MONTH="05" ;;
+    Jun) DATE_MONTH="06" ;;
+    Jul) DATE_MONTH="07" ;;
+    Aug) DATE_MONTH="08" ;;
+    Sep) DATE_MONTH="09" ;;
+    Oct) DATE_MONTH="10" ;;
+    Nov) DATE_MONTH="11" ;;
+    Dec) DATE_MONTH="12" ;;
+esac
+
+# Tag only when all six fields and a known month were found.
+if [ "$#" -eq 6 ] && [ -n "$DATE_MONTH" ]; then
+    VERSION_TAG_DATE="$1-$DATE_MONTH-$3-$4-$5-$6"
+    MOST_RECENT_COMMIT="$(git rev-parse HEAD)"
+    git tag "$VERSION_TAG_DATE" "$MOST_RECENT_COMMIT"
+else
+    echo "No timestamp found in commit message; skipping tag." >&2
+fi
+
+git push --all
+git push --tags
+
+cd "$OPEN_PWD" || exit 1