author    Maciej Sobkowski <maciej@sobkow.ski>  2025-05-19 21:28:36 +0200
committer Maciej Sobkowski <maciej@sobkow.ski>  2025-05-19 21:28:36 +0200
commit    68ebca0edfcea5474805dc2eb635489c8c8b6d43 (patch)
tree      318b17582c446bba741ff548e26e65e6e6c1537b /.local/bin
parent    7a33857c89300344de98712deceb103d7287e4fd (diff)
.local/bin/newsboat_urls_maintenance.sh: copied from newsboat contrib
Diffstat (limited to '.local/bin')
-rwxr-xr-x  .local/bin/newsboat_urls_maintenance.sh  133
1 file changed, 133 insertions, 0 deletions
diff --git a/.local/bin/newsboat_urls_maintenance.sh b/.local/bin/newsboat_urls_maintenance.sh
new file mode 100755
index 0000000..4fe9b38
--- /dev/null
+++ b/.local/bin/newsboat_urls_maintenance.sh
@@ -0,0 +1,133 @@
+#!/bin/sh
+# performs various maintenance tasks on the urls file and cache.db for newsboat:
+# - first pass: convert http to https in the urls file when an https feed is available
+# - second pass: convert feed urls to their new forwarded urls for 3xx http codes
+# feed urls not returning xml content are tagged "_fail"
+# cache.db is also updated to point at the new feed urls, so all read/unread articles and flags are preserved
+# the urls file and cache.db are automatically backed up with a timestamp before proceeding
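+# a urls file line is the feed URL optionally followed by tags, e.g. (hypothetical feed): "https://example.com/feed.xml linux news"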
+
+# TODO
+# address remaining feedback from https://github.com/newsboat/newsboat/pull/647
+# implement additional checks on active feeds:
+# https://www.linuxjournal.com/content/parsing-rss-news-feed-bash-script
+# is it returning valid rss?
+# when was the feed last updated?
+# sort valid feeds by last updated
+# tag feed "abandoned" when the most recent pubdate is more than 1 year old
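+# a minimal validity probe for the checks above could be (sketch, not wired in):
+#   grep -qiE "<(rss|feed|rdf)" "$rss" && echo "looks like a feed"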
+
+
+#newsboat urls file and cache locations
+u="$HOME/.config/newsboat/urls"
+db="$HOME/.local/share/newsboat/cache.db"
+#curl timeout (in seconds) for probing URLs
+timeout=20
+tagfail="_fail"
+useragent="Lynx/2.8.5rel.1 libwww-FM/2.14"
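+#identify as a text browser; some servers reject curl's default user agent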
+#where to dump headers
+rss="/tmp/newsboat-rss.tmp"
+headers="/tmp/newsboat-headers.tmp"
+
+# shuf (GNU coreutils) randomises the URL list; this avoids querying the same domain too quickly, assuming urls are grouped by domain or alphabetically sorted in the urls file
+requirements="newsboat curl sqlite3 sed grep awk head shuf"
+for app in $requirements
+do
+ command -v "$app" >/dev/null 2>&1 || { echo >&2 "$app is required but it's not installed or it's not in your PATH. Aborting."; exit 1; }
+done
+
+if [ ! -f "$u" ]; then
+ echo "$u not found. Edit the path/filename for the urls file"; exit
+fi
+if [ ! -f "$db" ]; then
+ echo "$db not found. Edit the path/filename for cache.db"; exit
+fi
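+#newsboat holds a cache.db.lock file while it is running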
+if [ -f "$db.lock" ]; then
+ echo "newsboat is still running. Stop it first then try again"; exit
+fi
+
+cp "$db" "$db.bak-$(date +%FT%T)"
+cp "$u" "$u.bak-$(date +%FT%T)"
+
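+#_replace probes $url2 and, on a 200 reply with an xml content-type, rewrites $url to $url2 in the urls file and in cache.db (rss_feed.rssurl, rss_item.feedurl)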
+_replace () {
+ response=$(curl -A "$useragent" --connect-timeout "$timeout" --max-time "$timeout" --write-out "%{http_code}" --silent -D "$headers" --output "$rss" "$url2")
+ if [ "$response" = "200" ]; then
+ if grep -qiE "content-type: .*xml" "$headers"; then
+ #escape the characters that are special in the sed replacement (& and \) in a copy of the url; sqlite must get the unescaped value
+ url2sed=$( echo "$url2" | sed -e 's/[&\\]/\\&/g' )
+ sed -i "s,^$url,$url2sed," "$u"
+ #point the feed row and all of its items at the new url so read/unread state and flags survive
+ sqlite3 "$db" "update rss_feed set rssurl='$url2' where rssurl = '$url' ; update rss_item set feedurl='$url2' where feedurl='$url'"
+ else
+ echo " not replacing that feed url because the reply is not recognised as rss content"
+ fi
+ else
+ echo " not replacing that feed url because the reply code is not 200"
+ fi
+ [ -f "$headers" ] && rm "$headers"
+ [ -f "$rss" ] && rm "$rss"
+}
+
+# replace http with https in feeds urls
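+#e.g. a hypothetical http://example.com/rss.xml is retried as https://example.com/rss.xml and only rewritten when the https variant replies 200 with xml content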
+feeds=$(grep -cE "^http:" "$u")
+i=0
+for url in $(shuf "$u" | grep -E "^http:" | awk '{print $1}')
+do
+ i=$((i+1))
+ url2=$(echo "$url" | sed 's/http:/https:/')
+ printf "\r\033[K%s/%s %s\n" "$i" "$feeds" "$url"
+ _replace
+done
+
+# second pass: check that all feeds return a valid http code; follow 3xx redirects and tag failing feeds
+feeds=$(grep -cE "^http" "$u")
+i=0
+for url in $(shuf "$u" | grep -E "^http" | awk '{print $1}')
+do
+ i=$((i+1))
+ #clear the line before echoing over it
+ printf "\r\033[K%s/%s %s\r" "$i" "$feeds" "$url"
+ #echo -ne "\r\e[K$i/$feeds $url\r"
+ response=$(curl -A "$useragent" --connect-timeout "$timeout" --max-time "$timeout" --write-out "%{http_code}" --silent -D "$headers" --output "$rss" "$url")
+ #debug: print the exact curl invocation
+ #echo curl -A "$useragent" --connect-timeout "$timeout" --max-time "$timeout" --write-out "%{http_code}" --silent -D "$headers" --output "$rss" "$url"
+ case "$response" in
+ 3*)
+ #url2=$(curl -A "$useragent" -IL --silent "$url" | awk '/^[lL]ocation: /{ print $2 }' | head -1 | sed 's/\r//g')
+ echo "$response [ https://httpstatuses.com/$response ] $url"
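+ #take the first Location header and strip the trailing CR from curl's CRLF header lines
+ #e.g. "location: https://example.com/feed" (absolute) or "location: /feed" (relative, handled below)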
+ url2=$(awk '/^[lL]ocation: /{ print $2 }' "$headers" | head -1 | sed 's/\r//g')
+ case "$url2" in
+ http*)
+ echo " moved to $url2"
+ _replace
+ ;;
+ /*)
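+ #relative redirect: rebuild an absolute URL from the scheme and host of the original
+ #e.g. url=https://example.com/old/feed with url2=/new/feed gives https://example.com/new/feed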
+ domain=$(echo "$url" | awk -F/ '{printf("%s//%s",$1,$3)}')
+ url2="$domain$url2"
+ echo " moved to $url2"
+ _replace
+ ;;
+ *)
+ printf "\n"
+ echo "not replacing that feed url because the new feed URL is invalid or incomplete"
+ ;;
+ esac
+ ;;
+ 429)
+ #too many requests: we are hammering this server
+ #uncomment and adjust the sleep below if randomising the feed order was not enough to avoid 429 replies
+ #sleep 60
+ ;;
+ 200)
+ # feed OK nothing to do
+ ;;
+ *)
+ #everything else (i.e. 000, 4xx and 5xx) gets tagged $tagfail
+ #some 2xx http codes might return valid rss feeds?
+ printf "\n"
+ #echo "$response [ https://httpstatuses.com/$response ] $url may have problems"
+ echo "$response [ https://httpstatuses.com/$response ] $url adding tag: $tagfail"
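+ #append the tag only if this feed line is not already tagged with it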
+ if [ ! "$(grep -cE "^$url $tagfail" "$u")" = 1 ]; then
+ sed -i "s,$url,$url $tagfail," "$u"
+ fi
+ ;;
+ esac
+done
+