Added website archiver script using httrack over Docker. Still not working correctly.
This commit is contained in:
parent
0a0e5e4b52
commit
981ff4f04e
14
website-archiver/httrack.Dockerfile
Normal file
14
website-archiver/httrack.Dockerfile
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
# Image that provides HTTrack for mirroring websites into the /archived volume.
FROM ubuntu:noble

# Build-time only: keeps apt (and tzdata) from prompting during installation.
ARG DEBIAN_FRONTEND="noninteractive"
# NOTE(review): not referenced by any instruction in this Dockerfile; kept so
# existing `--build-arg GIT_REPO_URL=...` invocations keep working.
ARG GIT_REPO_URL=""
# Container time zone; override with `--build-arg TZ=...`.
ARG TZ="America/New_York"
ENV TZ="${TZ}"

# Packages must be installed as root, i.e. before the USER instruction below.
# The apt lists are removed in the same layer so they never reach the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        httrack \
        tzdata \
    && rm -rf /var/lib/apt/lists/*

# WORKDIR creates the directory owned by root; hand it to the unprivileged
# user before declaring the volume so the ownership is part of the image.
WORKDIR /archived
RUN chown ubuntu:ubuntu /archived
VOLUME /archived

# `ubuntu` is the non-root account shipped in the ubuntu:noble base image.
USER ubuntu

# Exec form performs no trimming: the element must be exactly the binary name.
CMD ["httrack"]
|
6
website-archiver/sync-russellcountyida.org.sh
Executable file
6
website-archiver/sync-russellcountyida.org.sh
Executable file
|
@ -0,0 +1,6 @@
|
||||||
|
#!/bin/sh
# Mirror russellcountyida.org into its archive repository.
#
# REPO_DIR, URLS and MIRROR_DIR are handed to sync-website-archive.sh as
# one-shot environment assignments on a single command. Every assignment line
# must therefore end in a backslash; without it the assignment becomes a
# separate statement that sets an unexported shell variable the child script
# never sees.

REPO_DIR="../../nm3clol-archived-russellcountyida.org" \
URLS="https://russellcountyida.org/ https://www.russellcountyida.org/" \
MIRROR_DIR="mirror" \
./sync-website-archive.sh
|
6
website-archiver/sync-russellcountyva.us.sh
Executable file
6
website-archiver/sync-russellcountyva.us.sh
Executable file
|
@ -0,0 +1,6 @@
|
||||||
|
#!/bin/sh
# Mirror russellcountyva.us into its archive repository.
#
# Start URLs are one AgendaCenter search page per calendar year (newest
# first) followed by the site root.

BASE_URL="https://www.russellcountyva.us"

# Build the space-separated URL list. Inside double quotes '&' needs no
# escaping; writing '\&' there would put a literal backslash into the URL.
URLS=""
for YEAR in 2024 2023 2022 2021 2020 2019 2018 2017 2016; do
    URLS="${URLS}${BASE_URL}/AgendaCenter/Search/?term=&CIDs=all&startDate=01/01/${YEAR}&endDate=12/31/${YEAR}&dateRange=&dateSelector= "
done
URLS="${URLS}${BASE_URL}/"

# The three settings are passed as one-shot environment assignments on the
# same command, so each continuation line must end in a backslash.
REPO_DIR="../../nm3clol-archived-russellcountyva.us" \
URLS="$URLS" \
MIRROR_DIR="mirror" \
./sync-website-archive.sh
|
52
website-archiver/sync-website-archive.sh
Executable file
52
website-archiver/sync-website-archive.sh
Executable file
|
@ -0,0 +1,52 @@
|
||||||
|
#!/bin/sh
# Mirror a set of URLs with HTTrack into a git working tree, commit the
# result, tag the commit with the timestamp found in HTTrack's log, and push.
#
# Environment:
#   REPO_DIR    directory of the archive repository working tree (required)
#   REPO_URL    remote to clone from when REPO_DIR is empty (needed only then)
#   URLS        space-separated list of start URLs (required)
#   MIRROR_DIR  mirror directory inside the repository (default: mirror)

MIRROR_DIR="${MIRROR_DIR:-mirror}"

echo "REPO_DIR=$REPO_DIR"
echo "REPO_URL=$REPO_URL"
echo "URLS=$URLS"

if [ -z "$REPO_DIR" ] || [ -z "$URLS" ]; then
    echo "REPO_DIR and URLS must both be set." >&2
    exit 1
fi

# Remember the starting directory so it can be restored at the end.
OPEN_PWD="$(pwd)"
echo "OPEN_PWD=$OPEN_PWD"

# Abort if the repository directory is unusable: everything below, including
# the rm -rf, must only ever run inside it.
mkdir -p "$REPO_DIR" || exit 1
cd "$REPO_DIR" || exit 1

# We are already inside REPO_DIR, so test "." for content; REPO_DIR may be a
# relative path that no longer resolves from here.
if [ "$(ls -A .)" ]; then
    git pull || echo "warning: git pull failed; continuing with the local state." >&2
else
    if [ -z "$REPO_URL" ]; then
        echo "REPO_DIR is empty and REPO_URL is not set; cannot clone." >&2
        exit 1
    fi
    git clone "$REPO_URL" . || exit 1
fi

# Start from a clean mirror directory on every run.
rm -rf "./$MIRROR_DIR"

# $URLS is expanded unquoted on purpose so that every URL becomes its own
# argument. Pathname expansion is switched off around the call so that a '?'
# or '*' in a URL or filter is passed through literally.
set -f
httrack -wqiC2%Ps0u1%s%uN0%Ip3DaK0H1%kf2A125000%f#f \
    -%F "<!-- Mirrored from %s%s by HTTrack Website Copier/3.x [XR&CO'2014], %s -->" \
    -%l "en, *" \
    $URLS \
    -O1 "./$MIRROR_DIR" \
    '+*.png' '+*.gif' '+*.jpg' '+*.jpeg' '+*.css' '+*.js' '+*.pdf' \
    '+*.doc' '+*.docx' '+*.xls' '+*.xlsx' '+*.csv' '+*.odt' '+*.ppt' \
    '+*.pptx' '+*.epub' '+*.webp' '+*.webm' '+*.mkv' '+*.mpg' '+*.mpeg' \
    '+*.mov' '+*.wav' '+*.mp3' \
    '-mime:application/*'
set +f

LOG_FILE="./$MIRROR_DIR/hts-log.txt"
if [ ! -f "$LOG_FILE" ]; then
    echo "HTTrack did not write $LOG_FILE; nothing to commit." >&2
    exit 1
fi

git add "./$MIRROR_DIR/"

# The commit message is derived from the first log line, keeping everything
# before the last " at ".
MESSAGE="$(head -n 1 "$LOG_FILE" | sed -E 's/(.*) at .*/\"Synced with \1.\"/g')"

git commit -m "$MESSAGE"

# Pull "<day> <month-name> <year> <hh>:<mm>:<ss>" out of the message in one
# pass. With -n/p, sed prints nothing when the pattern does not match, so a
# parse failure leaves every field empty instead of echoing the whole message.
STAMP="$(printf '%s\n' "$MESSAGE" | sed -nE 's/.* on [[:alpha:]]+, ([[:digit:]]+) ([[:alpha:]]+) ([[:digit:]]+) ([[:digit:]]+):([[:digit:]]+):([[:digit:]]+).*/\3 \2 \1 \4 \5 \6/p')"
read -r DATE_YEAR MONTH_NAME DATE_DAY DATE_HOUR DATE_MINUTE DATE_SECOND <<EOF
$STAMP
EOF

# Convert the abbreviated month name to its two-digit number.
case "$MONTH_NAME" in
    Jan) DATE_MONTH="01" ;;
    Feb) DATE_MONTH="02" ;;
    Mar) DATE_MONTH="03" ;;
    Apr) DATE_MONTH="04" ;;
    May) DATE_MONTH="05" ;;
    Jun) DATE_MONTH="06" ;;
    Jul) DATE_MONTH="07" ;;
    Aug) DATE_MONTH="08" ;;
    Sep) DATE_MONTH="09" ;;
    Oct) DATE_MONTH="10" ;;
    Nov) DATE_MONTH="11" ;;
    Dec) DATE_MONTH="12" ;;
    *)   DATE_MONTH="" ;;
esac

VERSION_TAG_DATE="$DATE_YEAR-$DATE_MONTH-$DATE_DAY-$DATE_HOUR-$DATE_MINUTE-$DATE_SECOND"
MOST_RECENT_COMMIT="$(git rev-parse HEAD)"

# Only create the tag when every date component was parsed; otherwise the tag
# name would be malformed.
case "$VERSION_TAG_DATE" in
    [0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]-[0-9][0-9]-[0-9][0-9]-[0-9][0-9])
        git tag "$VERSION_TAG_DATE" "$MOST_RECENT_COMMIT"
        ;;
    *)
        echo "warning: could not parse a timestamp from: $MESSAGE; skipping tag." >&2
        ;;
esac

git push --all
git push --tags

cd "$OPEN_PWD" || exit 1
|
Loading…
Reference in New Issue
Block a user