From 696580d6696600e9669ca0bbd55199fea5edd2db Mon Sep 17 00:00:00 2001
From: Julien Moutinho <julm+julm-nix@sourcephile.fr>
Date: Fri, 8 Apr 2022 05:06:43 +0200
Subject: [PATCH] podl: add HTML support

---
 nixpkgs/overlays/podl.nix |   2 +-
 nixpkgs/overlays/podl.sh  | 151 +++++++++++++++++++++++---------------
 2 files changed, 91 insertions(+), 62 deletions(-)

diff --git a/nixpkgs/overlays/podl.nix b/nixpkgs/overlays/podl.nix
index 44f9e9b..f7116c3 100644
--- a/nixpkgs/overlays/podl.nix
+++ b/nixpkgs/overlays/podl.nix
@@ -2,7 +2,7 @@ self: super: {
   podl = super.writeShellScriptBin "podl" ''
     PATH=${with self; lib.makeBinPath [
       coreutils utillinux findutils gnugrep gnused
-      curl aria2 yt-dlp xmlstarlet
+      curl aria2 yt-dlp xmlstarlet file
     ]}
     ${builtins.readFile ./podl.sh}
   '';
diff --git a/nixpkgs/overlays/podl.sh b/nixpkgs/overlays/podl.sh
index edde9a3..30b49a3 100644
--- a/nixpkgs/overlays/podl.sh
+++ b/nixpkgs/overlays/podl.sh
@@ -87,69 +87,98 @@ while IFS= read -r found; do
      -e 's@.*youtube\.com.*list=\([^&]\+\).*@https://www.youtube.com/feeds/videos.xml?playlist_id=\1@' \
     )
     for feed in $feeds; do
-      export url_xpath="$(look_up .url.xpath cat)"
-      curl -Ls "$feed" |
-      xml select -T \
-       -N atom="http://www.w3.org/2005/Atom" \
-       -N yt="http://www.youtube.com/xml/schemas/2015" \
-       -N mrss="http://search.yahoo.com/mrss/" \
-       -t -m "/rss/channel/item" \
-       -o "title='" -v "translate(translate(title,'\"','_'),\"'/:?&|$IFS\",\"’_____   \")" -o "'" -n \
-       -o "guid='" -v "translate(translate(guid,'\"','_'),\"'$IFS\",\"’   \")" -o "'" -n \
-       -o "url='" -v "translate(${url_xpath:-"enclosure[1]/@url"},\"'$IFS\",\"’   \")" -o "'" -n \
-       -o "published='" -v "translate(pubDate,\"'$IFS\",\"’   \")" -o "'" -n \
-       -o '
-        file=${url##*/}
-        file=${file%%\#*}
-        file=${file%%\?*}
-        # remove leading whitespace characters
-        title="${title#"${title%%[![:space:]]*}"}"
-        # remove trailing whitespace characters
-        title="${title%"${title##*[![:space:]]}"}"
-        test -z "$url" ||
-        grep -qxF -e "url $url" -e "guid $guid" .downloaded || {
-          published=$(date +%Y-%m-%d -d "$published")
-          echo >&2 "$dst/$published - $title"
-          if test ! "${SKIP_DOWNLOAD:+set}"
-          then
-            yt-dlp $YT \
-             --output "$published - ${title//%/%%}.%(ext)s" \
-             "$url"
-          fi
-          { flock --exclusive 3
-            echo >&3 "guid $guid"
-            echo >&3 "url $url"
-          } 3>>.downloaded
-        }
-       ' -n -b \
-       -t -m "/atom:feed/atom:entry[yt:videoId]" \
-       -o "title='" -v "translate(translate(atom:title,'\"','_'),\"'/:?&|$IFS\",\"’_____   \")" -o "'" -n \
-       -o "url='" -v "translate(${url_xpath:-"atom:link[@rel='alternate']/@href"},\"'$IFS\",\"’   \")" -o "'" -n \
-       -o "published='" -v "translate(atom:published,\"'$IFS\",\"’   \")" -o "'" -n \
-       -o "id='" -v "translate(yt:videoId,\"'$IFS\",\"’   \")" -o "'" -n \
-       -o '
-        # remove leading whitespace characters
-        title="${title#"${title%%[![:space:]]*}"}"
-        # remove trailing whitespace characters
-        title="${title%"${title##*[![:space:]]}"}"
-        grep -qxF "youtube $id" .downloaded || {
-          published=$(date +%Y-%m-%d -d "$published")
-          echo >&2 "$dst/$published - $title.$id"
-          if test "${SKIP_DOWNLOAD:+set}"
-          then
-            { flock --exclusive 3
-              echo >&3 "youtube $id"
-            } 3>>.downloaded
-          else
-            yt-dlp $YT \
-             --download-archive .downloaded \
-             --output "$published - ${title//%/%%}.%(id)s.%(format_id)s.%(ext)s" \
-             "$url"
-          fi
-        }
-       ' |
+      (
+      file=$(mktemp)
+      trap "rm -f '$file'" EXIT
+      curl -Ls "$feed" -o "$file"
+      case $(file --mime-type "$file" | cut -f 2 -d :) in
+        (' 'text/html)
+          export html_match_xpath="$(look_up .html.match.xpath cat)"
+          export html_url_xpath="$(look_up .html.url.xpath cat)"
+          xml format --html <"$file" 2>/dev/null |
+          xml select --text \
+           -t -m "${html_match_xpath:-//a}" \
+           -o "url='" -v "translate(${html_url_xpath:-"@href"},\"'$IFS\",\"’   \")" -o "'" -n \
+           -o '
+            test -z "$url" ||
+            grep -qxF "url $url" .downloaded || {
+              if test ! "${SKIP_DOWNLOAD:+set}"
+              then
+                yt-dlp $YT "$url"
+              fi
+              { flock --exclusive 3
+                echo >&3 "url $url"
+              } 3>>.downloaded
+            }
+           ' -n
+          ;;
+        (' 'text/xml)
+          export url_xpath="$(look_up .url.xpath cat)"
+          xml select <"$file" --text \
+           -N atom="http://www.w3.org/2005/Atom" \
+           -N yt="http://www.youtube.com/xml/schemas/2015" \
+           -N mrss="http://search.yahoo.com/mrss/" \
+           -t -m "/rss/channel/item" \
+           -o "title='" -v "translate(translate(title,'\"','_'),\"'/:?&|$IFS\",\"’_____   \")" -o "'" -n \
+           -o "guid='" -v "translate(translate(guid,'\"','_'),\"'$IFS\",\"’   \")" -o "'" -n \
+           -o "url='" -v "translate(${url_xpath:-"enclosure[1]/@url"},\"'$IFS\",\"’   \")" -o "'" -n \
+           -o "published='" -v "translate(pubDate,\"'$IFS\",\"’   \")" -o "'" -n \
+           -o '
+            file=${url##*/}
+            file=${file%%\#*}
+            file=${file%%\?*}
+            # remove leading whitespace characters
+            title="${title#"${title%%[![:space:]]*}"}"
+            # remove trailing whitespace characters
+            title="${title%"${title##*[![:space:]]}"}"
+            test -z "$url" ||
+            grep -qxF -e "url $url" -e "guid $guid" .downloaded || {
+              published=$(date +%Y-%m-%d -d "$published")
+              echo >&2 "$dst/$published - $title"
+              if test ! "${SKIP_DOWNLOAD:+set}"
+              then
+                yt-dlp $YT \
+                 --output "$published - ${title//%/%%}.%(ext)s" \
+                 "$url"
+              fi
+              { flock --exclusive 3
+                echo >&3 "guid $guid"
+                echo >&3 "url $url"
+              } 3>>.downloaded
+            }
+           ' -n -b \
+           -t -m "/atom:feed/atom:entry[yt:videoId]" \
+           -o "title='" -v "translate(translate(atom:title,'\"','_'),\"'/:?&|$IFS\",\"’_____   \")" -o "'" -n \
+           -o "url='" -v "translate(${url_xpath:-"atom:link[@rel='alternate']/@href"},\"'$IFS\",\"’   \")" -o "'" -n \
+           -o "published='" -v "translate(atom:published,\"'$IFS\",\"’   \")" -o "'" -n \
+           -o "id='" -v "translate(yt:videoId,\"'$IFS\",\"’   \")" -o "'" -n \
+           -o '
+            # remove leading whitespace characters
+            title="${title#"${title%%[![:space:]]*}"}"
+            # remove trailing whitespace characters
+            title="${title%"${title##*[![:space:]]}"}"
+            grep -qxF "youtube $id" .downloaded || {
+              published=$(date +%Y-%m-%d -d "$published")
+              echo >&2 "$dst/$published - $title.$id"
+              if test "${SKIP_DOWNLOAD:+set}"
+              then
+                { flock --exclusive 3
+                  echo >&3 "youtube $id"
+                } 3>>.downloaded
+              else
+                yt-dlp $YT \
+                 --download-archive .downloaded \
+                 --output "$published - ${title//%/%%}.%(id)s.%(format_id)s.%(ext)s" \
+                 "$url"
+              fi
+            }
+           ';;
+      esac |
       "$SHELL" -seu${XTRACE:+x}
+      )
     done;;
   esac
   )
 done
+
+
-- 
2.47.2