| author | Maciej Sobkowski <maciej@sobkow.ski> | 2025-05-19 21:28:36 +0200 |
|---|---|---|
| committer | Maciej Sobkowski <maciej@sobkow.ski> | 2025-05-19 21:28:36 +0200 |
| commit | 68ebca0edfcea5474805dc2eb635489c8c8b6d43 | (patch) |
| tree | 318b17582c446bba741ff548e26e65e6e6c1537b | /.local/bin |
| parent | 7a33857c89300344de98712deceb103d7287e4fd | (diff) |
.local/bin/newsboat_urls_maintenance.sh: copied from newsboat contrib
Diffstat (limited to '.local/bin')
| -rwxr-xr-x | .local/bin/newsboat_urls_maintenance.sh | 130 |

1 file changed, 130 insertions, 0 deletions
diff --git a/.local/bin/newsboat_urls_maintenance.sh b/.local/bin/newsboat_urls_maintenance.sh
new file mode 100755
index 0000000..4fe9b38
--- /dev/null
+++ b/.local/bin/newsboat_urls_maintenance.sh
@@ -0,0 +1,130 @@
+#!/bin/sh
+# performs various maintenance tasks on the urls file and cache.db for newsboat:
+# - first pass: convert http to https in the urls file when an https feed is available
+# - second pass: convert feed urls to their new forwarded urls for 3xx http codes
+# feed urls not returning xml content are tagged "_fail"
+# cache.db is also updated to refer to the new feed urls so all read/unread articles and flags are preserved
+# the urls file and cache.db are automatically backed up with a timestamp before proceeding
+
+# TODO
+# address remaining feedback from https://github.com/newsboat/newsboat/pull/647
+# implement additional checks on active feeds:
+# https://www.linuxjournal.com/content/parsing-rss-news-feed-bash-script
+# is it returning valid rss?
+# when was the feed last updated?
+# sort valid feeds by last updated
+# tag feed "abandoned" when the most recent pubdate is more than 1 year old
+
+
+#newsboat urls file and cache locations
+u="$HOME/.config/newsboat/urls"
+db="$HOME/.local/share/newsboat/cache.db"
+#curl timeout for probing URLs
+timeout=20
+tagfail="_fail"
+useragent="Lynx/2.8.5rel.1 libwww-FM/2.14"
+#where to dump the fetched feed and headers
+rss="/tmp/newsboat-rss.tmp"
+headers="/tmp/newsboat-headers.tmp"
+
+# shuf (GNU coreutils) randomises the urls list; this avoids querying the same domains too fast, assuming urls are grouped by domain or alphasorted in the urls file
+requirements="newsboat curl sqlite3 sed grep awk head shuf"
+for app in $requirements
+do
+	command -v "$app" >/dev/null 2>&1 || { echo >&2 "$app is required but it's not installed or it's not in your PATH. Aborting."; exit 1; }
+done
+
+if [ ! -f "$u" ]; then
+	echo "$u not found. edit the path/filename for the urls file"; exit 1
+fi
+if [ ! -f "$db" ]; then
+	echo "$db not found. edit the path/filename for cache.db"; exit 1
+fi
+if [ -f "$db.lock" ]; then
+	echo "newsboat is still running. Stop it first, then try again"; exit 1
+fi
+
+cp "$db" "$db.bak-$(date +%FT%T)"
+cp "$u" "$u.bak-$(date +%FT%T)"
+
+# replace $url with $url2 in the urls file and in cache.db when $url2 replies 200 with xml content; uses the globals $url and $url2
+_replace () {
+	response=$(curl -A "$useragent" --connect-timeout "$timeout" --max-time "$timeout" --write-out "%{http_code}" --silent -D "$headers" --output "$rss" "$url2")
+	if [ "$response" = "200" ]; then
+		if grep -qiE "content-type: .*xml" "$headers"; then
+			#escape &, \ and /, which are special in the sed replacement
+			url2=$( echo "$url2" | sed -e 's/[&\\/]/\\&/g' )
+			sed -i "s,^$url,$url2," "$u"
+			sqlite3 "$db" "update rss_feed set rssurl='$url2' where rssurl = '$url' ; update rss_item set feedurl='$url2' where feedurl='$url'"
+		else
+			echo " not replacing that feed url because the reply is not recognised as rss content"
+		fi
+	else
+		echo " not replacing that feed url because the reply code is not 200"
+	fi
+	[ -f "$headers" ] && rm "$headers"
+	[ -f "$rss" ] && rm "$rss"
+}
+
+# first pass: replace http with https in feed urls
+feeds=$(grep -cE "^http:" "$u")
+i=0
+for url in $(shuf "$u" | grep -E "^http:" | awk '{print $1}')
+do
+	i=$((i+1))
+	url2=$(echo "$url" | sed 's/http:/https:/')
+	printf "\r\033[K%s/%s %s\n" "$i" "$feeds" "$url"
+	_replace
+done
+
+# second pass: check that all feeds return valid http codes
+feeds=$(grep -cE "^http" "$u")
+i=0
+for url in $(shuf "$u" | grep -E "^http" | awk '{print $1}')
+do
+	i=$((i+1))
+	#clear the line before echoing over it
+	printf "\r\033[K%s/%s %s\r" "$i" "$feeds" "$url"
+	response=$(curl -A "$useragent" --connect-timeout "$timeout" --max-time "$timeout" --write-out "%{http_code}" --silent -D "$headers" --output "$rss" "$url")
+	case "$response" in
+		3*)
+			#url2=$(curl -A "$useragent" -IL --silent "$url" | awk '/^[lL]ocation: /{ print $2 }' | head -1 | sed 's/\r//g')
+			echo "$response [ https://httpstatuses.com/$response ] $url"
+			url2=$(awk '/^[lL]ocation: /{ print $2 }' "$headers" | head -1 | sed 's/\r//g')
+			case "$url2" in
+				http*)
+					echo " moved to $url2"
+					_replace
+					;;
+				/*)
+					#relative redirect: prepend the scheme and domain of the original url
+					domain=$(echo "$url" | awk -F/ '{printf("%s//%s",$1,$3)}')
+					url2="$domain$url2"
+					echo " moved to $url2"
+					_replace
+					;;
+				*)
+					printf "\n"
+					echo " not replacing that feed url because the new feed URL is invalid or incomplete"
+					;;
+			esac
+			;;
+		429)
+			#oops, hammering with too many requests
+			#uncomment and adjust the sleep timer below if randomising the feeds sequence was not enough to avoid "429" replies
+			#sleep 60
+			;;
+		200)
+			# feed OK, nothing to do
+			;;
+		*)
+			#everything else, i.e. 000, 4xx and 5xx, gets tagged $tagfail
+			#some 2xx http codes might still return valid rss feeds?
+			printf "\n"
+			echo "$response [ https://httpstatuses.com/$response ] $url adding tag: $tagfail"
+			if ! grep -q "^$url $tagfail" "$u"; then
+				sed -i "s,$url,$url $tagfail," "$u"
+			fi
+			;;
+	esac
+done
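The TODO block in the script mentions checking when a feed was last updated and tagging it "abandoned" when its most recent pubdate is over a year old. A minimal sketch of one way to do that, not part of the commit: it assumes GNU date (for -d) and GNU grep (for -o), and that the first <pubDate> in the document belongs to the newest item, which holds for most feeds but is not guaranteed by the RSS spec.

```sh
#!/bin/sh
# sketch only: report a feed as abandoned when its newest pubDate is over a year old
# assumes GNU date -d and GNU grep -o; takes the feed url as $1
url="$1"
pubdate=$(curl --silent --max-time 20 "$url" \
	| grep -o '<pubDate>[^<]*</pubDate>' \
	| head -1 \
	| sed 's,</*pubDate>,,g')
[ -z "$pubdate" ] && { echo "no pubDate found in $url"; exit 1; }
if [ "$(date -d "$pubdate" +%s)" -lt "$(date -d '1 year ago' +%s)" ]; then
	echo "$url: newest item from $pubdate, looks abandoned"
fi
```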
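The _replace function rewrites the old url in both rss_feed.rssurl and rss_item.feedurl (those statements appear in the function above), which is what preserves read/unread state across a url migration. A quick way to confirm a migration took effect, not part of the commit; the old url below is a hypothetical placeholder:

```sh
# both counts should be 0 once _replace has migrated a feed
old="http://example.com/feed.xml"	#hypothetical old feed url
sqlite3 "$HOME/.local/share/newsboat/cache.db" \
	"select count(*) from rss_feed where rssurl='$old';
	 select count(*) from rss_item where feedurl='$old';"
```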
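Because the script copies the urls file and cache.db to *.bak-TIMESTAMP before touching them, a bad run can be rolled back. A sketch under two assumptions: the default paths from the script, and no whitespace in $HOME. The %FT%T timestamps sort chronologically, so the last glob match is the newest backup.

```sh
#!/bin/sh
# roll back to the newest backups made by the script
u="$HOME/.config/newsboat/urls"
db="$HOME/.local/share/newsboat/cache.db"
cp "$(ls "$u".bak-* | tail -1)" "$u"
cp "$(ls "$db".bak-* | tail -1)" "$db"
```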
