Add a script for downloading all the episodes in an RSS feed
authorAlex Chan <alex@alexwlchan.net>
Sun, 12 Jan 2020 13:20:51 +0000 (13:20 +0000)
committerAlex Chan <alex@alexwlchan.net>
Sun, 12 Jan 2020 13:21:01 +0000 (13:21 +0000)
Closes #2

README.md
download_all_episodes_from_rss.py [new file with mode: 0755]
download_overcast_podcasts.py

index 05f7a93..5e298e7 100644 (file)
--- a/README.md
+++ b/README.md
@@ -40,7 +40,7 @@ You need:
     If you haven't done this before, or you've forgotten your email/password, read [my instructions](add_email_password) for doing so.
 
 *   **A working Python 3 installation.**
-    This script only works with Python 3.
+    This script only works with Python 3.6 or later.
     You need to install dependencies with pip:
 
     ```console
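For reference, the new script added below also imports `lxml` and `smartypants`, so installing its dependencies by hand would look something like this sketch:

```console
$ pip3 install lxml smartypants
```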
diff --git a/download_all_episodes_from_rss.py b/download_all_episodes_from_rss.py
new file mode 100755 (executable)
index 0000000..a6ccb17
--- /dev/null
+++ b/download_all_episodes_from_rss.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+"""
+The main downloader script also saves a copy of each podcast's RSS feed.
+
+If there are episodes in those RSS feeds that you haven't listened to in Overcast,
+but that you still want in your podcast archive (for example, because you listened
+to them in a different podcast app), you can use this script to download them all.
+"""
+
+import glob
+import html
+import os
+import sys
+
+from lxml import etree
+import smartypants
+
+from download_overcast_podcasts import download_url, get_filename, logger
+
+
+def download_files_for_xml(xml_path):
+    logger.info("Inspecting %r", xml_path)
+    tree = etree.parse(xml_path)
+
+    download_dir = os.path.dirname(xml_path)
+
+    for item in tree.xpath(".//item"):
+        title = item.find("title").text
+        logger.debug("Checking episode %r", title)
+
+        audio_url = item.find("enclosure").attrib["url"]
+
+        filename = get_filename(
+            download_url=audio_url,
+            # We have to replicate some of the processing done by Overcast's
+            # title cleanups.
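+            # smartypants turns straight quotes and dashes into HTML entities
+            # (e.g. ' becomes &#8217;), and html.unescape then converts those
+            # entities back into Unicode characters, matching the cleaned-up
+            # titles Overcast reports.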
+            title=html.unescape(smartypants.smartypants(title)),
+        )
+        download_path = os.path.join(download_dir, filename)
+
+        if os.path.exists(download_path):
+            logger.debug("This episode is already downloaded, skipping")
+            continue
+
+        logger.info("Downloading episode %r", title)
+
+        download_url(url=audio_url, path=download_path, description="audio file")
+
+
+if __name__ == "__main__":
+    try:
+        audiofile_dir = sys.argv[1]
+    except IndexError:
+        sys.exit(f"Usage: {__file__} <AUDIOFILE_DIR>")
+
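+    # The main downloader saves a dated copy of each podcast's RSS feed
+    # (see save_rss_feed), which is what this glob pattern picks up.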
+    for xml_path in glob.iglob(os.path.join(audiofile_dir, "feed.*.xml")):
+        download_files_for_xml(xml_path)
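With the main downloader's default `--download_dir` of `audiofiles`, running the new script would look something like this sketch:

```console
$ python3 download_all_episodes_from_rss.py audiofiles
```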
index 955d1ce..5495a1b 100755 (executable)
--- a/download_overcast_podcasts.py
+++ b/download_overcast_podcasts.py
@@ -39,17 +39,9 @@ def parse_args(argv):
     )
 
     parser.add_argument(
-        "--download_dir", default="audiofiles",
-        help="directory to save podcast information to to"
-    )
-
-    parser.add_argument(
-        "--user_agent", default="Python-urllib/%d.%d" % sys.version_info[:2],
-        help="""
-        user-agent to send in requests.  Some sites return a 403 Error if you try
-        to download files with urllib.  You could use (for example) 'Mozilla/5.0',
-        which might get files which otherwise fail to download.
-        """
+        "--download_dir",
+        default="audiofiles",
+        help="directory to save podcast information to to",
     )
 
     args = parser.parse_args(argv)
@@ -57,7 +49,6 @@ def parse_args(argv):
     return {
         "opml_path": os.path.abspath(args.OPML_PATH),
         "download_dir": os.path.abspath(args.download_dir),
-        "user_agent": args.user_agent,
     }
 
 
@@ -137,6 +128,31 @@ def _escape(s):
     return s.replace(":", "-").replace("/", "-")
 
 
+def get_filename(*, download_url, title):
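+    # e.g. a download_url of "https://example.com/ep1.mp3?t=1" (a made-up URL)
+    # with title "Episode 1: My Great Podcast" gives the filename
+    # "Episode 1- My Great Podcast.mp3"; only the URL path supplies the
+    # extension, so any query string is ignored.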
+    url_path = urlparse(download_url).path
+
+    extension = os.path.splitext(url_path)[-1]
+    base_name = _escape(title)
+
+    return base_name + extension
+
+
+def download_url(*, url, path, description):
+    # Some sites block the default urllib User-Agent headers, so we can customise
+    # it to something else if necessary.
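+    # (Note: install_opener sets the process-wide default opener, so this
+    # User-Agent applies to every subsequent urlretrieve call, too.)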
+    opener = build_opener()
+    opener.addheaders = [("User-agent", "Mozilla/5.0")]
+    install_opener(opener)
+
+    try:
+        tmp_path, _ = urlretrieve(url)
+    except Exception as err:
+        logger.error(f"Error downloading {description}: {err}")
+    else:
+        logger.info(f"Downloading {description} successful!")
+        os.rename(tmp_path, path)
+
+
 def download_episode(episode, download_dir):
     """
     Given a blob of episode data from get_episodes, download the MP3 file and
@@ -146,16 +162,12 @@ def download_episode(episode, download_dir):
     # title is "Episode 1: My Great Podcast", the filename is
     # ``Episode 1- My Great Podcast.mp3``.
     audio_url = episode["episode"]["enclosure_url"]
-    url_path = urlparse(audio_url).path
-
-    extension = os.path.splitext(url_path)[-1]
-    base_name = _escape(episode["episode"]["title"])
 
-    filename = base_name + extension
+    filename = get_filename(download_url=audio_url, title=episode["episode"]["title"])
 
     # Within the download_dir, put the episodes for each podcast in the
     # same folder.
     podcast_dir = os.path.join(download_dir, _escape(episode["podcast"]["title"]))
     mkdir_p(podcast_dir)
 
     # Download the podcast audio file if it hasn't already been downloaded.
@@ -186,13 +198,7 @@ def download_episode(episode, download_dir):
         logger.info(
             "Downloading %s: %s to %s", episode["podcast"]["title"], audio_url, filename
         )
-        try:
-            tmp_path, _ = urlretrieve(audio_url)
-        except Exception as err:
-            logger.error("Error downloading audio file: %s", err)
-        else:
-            logger.info("Download successful!")
-            os.rename(tmp_path, download_path)
+        download_url(url=audio_url, path=download_path, description="MP3 file")
 
     # Save a blob of JSON with some episode metadata
     episode["filename"] = filename
@@ -206,7 +212,7 @@ def download_episode(episode, download_dir):
 
 
 def save_rss_feed(*, episode, download_dir):
     podcast_dir = os.path.join(download_dir, _escape(episode["podcast"]["title"]))
 
     today = datetime.datetime.now().strftime("%Y-%m-%d")
 
@@ -216,13 +222,9 @@ def save_rss_feed(*, episode, download_dir):
         return
 
     logger.info("Downloading RSS feed for %s", episode["podcast"]["title"])
-    try:
-        tmp_path, _ = urlretrieve(episode["podcast"]["xml_url"])
-    except Exception as err:
-        logger.error("Error downloading RSS feed: %s", err)
-    else:
-        logger.info("Downloaded RSS successfully!")
-        os.rename(tmp_path, rss_path)
+    download_url(
+        url=episode["podcast"]["xml_url"], path=rss_path, description="RSS feed"
+    )
 
 
 if __name__ == "__main__":
@@ -231,12 +233,6 @@ if __name__ == "__main__":
     opml_path = args["opml_path"]
     download_dir = args["download_dir"]
 
-    # Some sites block the default urllib User-Agent headers, so we can customise
-    # it to something else if necessary.
-    opener = build_opener()
-    opener.addheaders = [("User-agent", args["user_agent"])]
-    install_opener(opener)
-
     try:
         with open(opml_path) as infile:
             xml_string = infile.read()