#!/bin/sh

# performs various maintenance tasks on the urls file and cache.db for newsboat:
# - first pass: convert http to https in the urls file when an https feed is available
# - second pass: convert feed urls to their new forwarded urls for 3xx http codes;
#   feed urls not returning xml content are tagged "_fail"
# cache.db is also updated to refer to the new feed urls, so all read/unread
# articles and flags are preserved
# the urls file and cache.db are automatically backed up with a timestamp before proceeding

# TODO
# address remaining feedback from https://github.com/newsboat/newsboat/pull/647
# implement additional checks on active feeds (see the sketch at the end of this script):
# https://www.linuxjournal.com/content/parsing-rss-news-feed-bash-script
# - is it returning valid rss?
# - when was the feed last updated?
# - sort valid feeds by last updated
# - tag feed "abandoned" when the most recent pubDate is more than 1 year old

#newsboat urls file and cache locations
u="$HOME/.config/newsboat/urls"
db="$HOME/.local/share/newsboat/cache.db"

#curl timeout for probing URLs
timeout=20

tagfail="_fail"
useragent="Lynx/2.8.5rel.1 libwww-FM/2.14"

#where to dump the downloaded feed and its headers
rss="/tmp/newsboat-rss.tmp"
headers="/tmp/newsboat-headers.tmp"

# shuf (GNU coreutils) randomises the urls list; this avoids querying the same
# domains too fast, assuming urls are grouped by domain or alphabetically
# sorted in the urls file
requirements="newsboat curl sqlite3 sed grep awk head shuf"

for app in $requirements
do
    command -v "$app" >/dev/null 2>&1 || {
        echo >&2 "$app is required but it's not installed or it's not in your PATH. Aborting."
        exit 1
    }
done

if [ ! -f "$u" ]; then
    echo "$u not found. Edit the path/filename for the urls file."
    exit 1
fi

if [ ! -f "$db" ]; then
    echo "$db not found. Edit the path/filename for cache.db."
    exit 1
fi

if [ -f "$db.lock" ]; then
    echo "newsboat is still running. Stop it first, then try again."
    exit 1
fi
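
# the cp calls below snapshot both files before anything is modified; to roll
# back a bad run, copy a snapshot over the live file, e.g. (the timestamp
# shown is hypothetical):
#   cp "$u.bak-2023-01-01T00:00:00" "$u"
#   cp "$db.bak-2023-01-01T00:00:00" "$db"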
cp "$db" "$db.bak-$(date +%FT%T)"
cp "$u" "$u.bak-$(date +%FT%T)"

_replace () {
    response=$(curl -A "$useragent" --connect-timeout "$timeout" --max-time "$timeout" --write-out "%{http_code}" --silent -D "$headers" --output "$rss" "$url2")
    if [ "$response" = "200" ]; then
        if grep -qiE "content-type: .*xml" "$headers"; then
            #escape & and \ (special characters in sed replacements) and ,
            #(the sed delimiter below); keep the raw url in $url2 for sqlite
            url2esc=$(echo "$url2" | sed -e 's/[&\\,]/\\&/g')
            sed -i "s,^$url,$url2esc," "$u"
            sqlite3 "$db" "update rss_feed set rssurl='$url2' where rssurl='$url'; update rss_item set feedurl='$url2' where feedurl='$url'"
        else
            echo " not replacing that feed url because the feed reply is not recognised as rss content"
        fi
    else
        echo " not replacing that feed url because the feed reply code is not 200"
    fi
    [ -f "$headers" ] && rm "$headers"
    [ -f "$rss" ] && rm "$rss"
}

# first pass: replace http with https in feed urls
feeds=$(grep -cE "^http:" "$u")
i=0
for url in $(shuf "$u" | grep -E "^http:" | awk '{print $1}')
do
    i=$((i+1))
    url2=$(echo "$url" | sed 's/http:/https:/')
    printf "\r\033[K%s/%s %s\n" "$i" "$feeds" "$url"
    _replace
done

# second pass: check that all feeds return valid http codes
feeds=$(grep -cE "^http" "$u")
i=0
for url in $(shuf "$u" | grep -E "^http" | awk '{print $1}')
do
    i=$((i+1))
    #clear the line before echoing over it
    printf "\r\033[K%s/%s %s\r" "$i" "$feeds" "$url"
    response=$(curl -A "$useragent" --connect-timeout "$timeout" --max-time "$timeout" --write-out "%{http_code}" --silent -D "$headers" --output "$rss" "$url")
    case "$response" in
        3*)
            echo "$response [ https://httpstatuses.com/$response ] $url"
            url2=$(awk '/^[lL]ocation: /{ print $2 }' "$headers" | head -1 | sed 's/\r//g')
            case "$url2" in
                http*)
                    echo " moved to $url2"
                    _replace
                    ;;
                /*)
                    #relative redirect: rebuild scheme://domain from the original url
                    domain=$(echo "$url" | awk -F/ '{printf("%s//%s",$1,$3)}')
                    url2="$domain$url2"
                    echo " moved to $url2"
                    _replace
                    ;;
                *)
                    printf "\n"
                    echo " not replacing that feed url because the new feed URL is invalid or incomplete"
                    ;;
            esac
            ;;
        429)
            #oops, hammering with too many requests
            #uncomment and adjust the sleep timer below if randomising the feeds
            #sequence was not enough to avoid "429" replies
            #sleep 60
            ;;
        200)
            #feed OK, nothing to do
            ;;
        *)
            #everything else, i.e. 000, 4xx and 5xx, gets tagged _fail
            #some 2xx http codes might return valid rss feeds?
            printf "\n"
            echo "$response [ https://httpstatuses.com/$response ] $url adding tag: $tagfail"
            if [ ! "$(grep -cE "^$url $tagfail" "$u")" = 1 ]; then
                sed -i "s,$url,$url $tagfail," "$u"
            fi
            ;;
    esac
done
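
# untested sketch for the "abandoned" TODO at the top of this script. it is
# defined but never called, so it does not change the script's behaviour.
# assumptions: GNU date (for RFC-822 pubDate parsing), "$rss" still holds the
# last downloaded feed, and the first <pubDate> in the file is a usable proxy
# for the most recent one; "abandoned" is the tag name suggested in the TODO
_tag_abandoned () {
    lastpub=$(grep -oE '<pubDate>[^<]*</pubDate>' "$rss" | head -1 | sed 's,</*pubDate>,,g')
    [ -n "$lastpub" ] || return 0
    lastepoch=$(date -d "$lastpub" +%s 2>/dev/null) || return 0
    #tag the feed in the urls file when its newest item is older than one year
    if [ "$lastepoch" -lt "$(date -d '1 year ago' +%s)" ]; then
        sed -i "s,$url,$url abandoned," "$u"
    fi
}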