Fix cross-device link errors (OSError: [Errno 18]) when moving a downloaded file from the system temp directory to a destination on a different filesystem
[overcast-downloader] / download_overcast_podcasts.py
1 #!/usr/bin/env python
2 # -*- encoding: utf-8
3 """
4 Download podcast files based on your Overcast export.
5
6 If you have an Overcast account, you can download an OPML file with
7 a list of every episode you've played from https://overcast.fm/account.
8
9 This tool can read that OPML file, and save a local copy of the audio files
10 for every episode you've listened to.
11 """
12
import argparse
import datetime
import errno
import filecmp
import functools
import glob
import itertools
import json
import os
import shutil
import sqlite3
import sys
import xml.etree.ElementTree as ET
from urllib.parse import urlparse
from urllib.request import build_opener, install_opener, urlretrieve
27
28
def parse_args(argv):
    """Parse command-line arguments.

    Returns a dict with two keys: ``opml_path`` (absolute path to the OPML
    export) and ``download_dir`` (absolute path to the output directory).
    """
    parser = argparse.ArgumentParser(description=__doc__)

    parser.add_argument(
        "OPML_PATH",
        help="Path to an OPML file downloaded from https://overcast.fm/account",
    )

    parser.add_argument(
        "--download_dir",
        default="audiofiles",
        # Fixed doubled word ("to to") in the original help text.
        help="directory to save podcast information to",
    )

    args = parser.parse_args(argv)

    return {
        "opml_path": os.path.abspath(args.OPML_PATH),
        "download_dir": os.path.abspath(args.download_dir),
    }
50
51
def get_episodes(xml_string):
    """
    Given the XML string of the Overcast OPML, generate a sequence of entries
    that represent a single, played podcast episode.
    """
    # The Overcast export nests everything under <body>:
    #
    #   <opml version="1.0">
    #       <head>...</head>
    #       <body>
    #           <outline text="playlists">...</outline>
    #           <outline text="feeds">
    #               <outline type="rss" title="..." text="..." xmlUrl="...">
    #                   <outline type="podcast-episode"
    #                            overcastId="..." pubDate="..." title="..."
    #                            url="..." overcastUrl="..."
    #                            enclosureUrl="..."/>
    #                   ...
    #               </outline>
    #           </outline>
    #       </body>
    #   </opml>
    #
    # We walk the <outline type="rss"> feeds inside <outline text="feeds">
    # (for the podcast metadata), then each "podcast-episode" entry within
    # that feed.
    root = ET.fromstring(xml_string)

    # Map each output key to the XML attribute it is read from.
    podcast_attrs = (("title", "title"), ("text", "text"), ("xml_url", "xmlUrl"))
    episode_attrs = (
        ("published_date", "pubDate"),
        ("title", "title"),
        ("url", "url"),
        ("overcast_id", "overcastId"),
        ("overcast_url", "overcastUrl"),
        ("enclosure_url", "enclosureUrl"),
    )

    feeds = root.findall("./body/outline[@text='feeds']/outline[@type='rss']")

    for feed in feeds:
        podcast = {key: feed.get(attr) for key, attr in podcast_attrs}

        for episode_xml in feed.findall("./outline[@type='podcast-episode']"):
            episode = {key: episode_xml.get(attr) for key, attr in episode_attrs}

            yield {"podcast": podcast, "episode": episode}
111
112
def has_episode_been_downloaded_already(episode, download_dir):
    """
    Return True if this episode's Overcast ID is already recorded in the
    download database (``overcast.db`` in ``download_dir``).

    Returns False when the database or the ``downloaded_episodes`` table
    doesn't exist yet -- i.e. nothing has been downloaded so far.
    """
    try:
        conn = sqlite3.connect(os.path.join(download_dir, "overcast.db"))
    except sqlite3.OperationalError as err:
        # No usable database file means nothing has been downloaded yet.
        if err.args[0] == "unable to open database file":
            return False
        else:
            raise

    # The original leaked the connection on every call; always close it.
    try:
        c = conn.cursor()

        try:
            c.execute(
                "SELECT * FROM downloaded_episodes WHERE overcast_id=?",
                (episode["episode"]["overcast_id"],),
            )
        except sqlite3.OperationalError as err:
            # The table is only created after the first successful download.
            if err.args[0] == "no such table: downloaded_episodes":
                return False
            else:
                raise

        return c.fetchone() is not None
    finally:
        conn.close()
136
137
def mark_episode_as_downloaded(episode, download_dir):
    """
    Record this episode's Overcast ID in the download database so later runs
    skip it (see ``has_episode_been_downloaded_already``).
    """
    conn = sqlite3.connect(os.path.join(download_dir, "overcast.db"))

    # Close the connection on every path; the original leaked it if the
    # INSERT raised.
    try:
        c = conn.cursor()

        # "IF NOT EXISTS" replaces the original create-then-catch-the-error
        # dance with a single idempotent statement.
        c.execute(
            "CREATE TABLE IF NOT EXISTS downloaded_episodes "
            "(overcast_id text PRIMARY KEY)"
        )

        c.execute(
            "INSERT INTO downloaded_episodes VALUES (?)",
            (episode["episode"]["overcast_id"],),
        )
        conn.commit()
    finally:
        conn.close()
156
157
158 def _escape(s):
159     return s.replace(":", "-").replace("/", "-")
160
161
def get_filename(*, download_url, title):
    """
    Build the local filename for an episode: the escaped title plus the
    file extension taken from the download URL's path.
    """
    path_component = urlparse(download_url).path
    _, extension = os.path.splitext(path_component)

    # Map ":" and "/" to "-" so the title is safe to use as a filename
    # (inlined from the module's escaping helper).
    safe_title = title.replace(":", "-").replace("/", "-")

    return safe_title + extension
169
170
def download_url(*, url, path, description):
    """
    Download ``url`` and save it at ``path``.

    ``description`` is a human-readable label used in the printed
    success/error messages.  Errors are reported, not raised.
    """
    # Some sites block the default urllib User-Agent headers, so we can customise
    # it to something else if necessary.
    opener = build_opener()
    opener.addheaders = [("User-agent", "Mozilla/5.0")]
    install_opener(opener)

    try:
        tmp_path, _ = urlretrieve(url)
    except Exception as err:
        print(f"Error downloading {description}: {err}")
    else:
        print(f"Downloading {description} successful!")
        # urlretrieve saves to the system temp dir, which may be on a
        # different filesystem than ``path``.  os.rename() cannot cross
        # filesystems (OSError: [Errno 18] Invalid cross-device link), so
        # use shutil.move, which falls back to copy-and-delete.
        shutil.move(tmp_path, path)
185
186
def download_episode(episode, download_dir):
    """
    Given a blob of episode data from get_episodes, download the MP3 file and
    save the metadata to ``download_dir``.

    Skips episodes already recorded in the download database.  Handles feeds
    that reuse filenames across episodes by appending the Overcast ID, and
    de-duplicates byte-identical downloads.
    """
    if has_episode_been_downloaded_already(episode=episode, download_dir=download_dir):
        return

    # If the MP3 URL is https://example.net/mypodcast/podcast1.mp3 and the
    # title is "Episode 1: My Great Podcast", the filename is
    # ``Episode 1- My Great Podcast.mp3``.
    audio_url = episode["episode"]["enclosure_url"]

    filename = get_filename(download_url=audio_url, title=episode["episode"]["title"])

    # Within the download_dir, put the episodes for each podcast in the
    # same folder.
    podcast_dir = os.path.join(download_dir, _escape(episode["podcast"]["title"]))
    os.makedirs(podcast_dir, exist_ok=True)

    # Download the podcast audio file if it hasn't already been downloaded.
    download_path = os.path.join(podcast_dir, filename)
    base_name = _escape(episode["episode"]["title"])
    json_path = os.path.join(podcast_dir, base_name + ".json")

    # If the MP3 file already exists, check to see if it's the same episode,
    # or if this podcast isn't using unique filenames.
    #
    # If a podcast has multiple episodes with the same filename in its feed,
    # append the Overcast ID to disambiguate.
    if os.path.exists(download_path):
        # The metadata JSON written by the earlier run tells us which episode
        # the existing audio file belongs to.
        try:
            cached_metadata = json.load(open(json_path, "r"))
        except Exception as err:
            print(err, json_path)
            raise

        cached_overcast_id = cached_metadata["episode"]["overcast_id"]
        this_overcast_id = episode["episode"]["overcast_id"]

        if cached_overcast_id != this_overcast_id:
            # Same filename, different episode: download under a
            # disambiguated name, then compare the bytes in case the feed
            # actually points at the same audio twice.
            filename = filename.replace(".mp3", "_%s.mp3" % this_overcast_id)
            old_download_path = download_path
            download_path = os.path.join(podcast_dir, filename)
            json_path = download_path + ".json"

            print(
                "Downloading %s: %s to %s"
                % (episode["podcast"]["title"], audio_url, filename)
            )
            download_url(url=audio_url, path=download_path, description=audio_url)

            try:
                if filecmp.cmp(download_path, old_download_path, shallow=False):
                    print("Duplicates detected! %s" % download_path)
                    os.unlink(download_path)
                    download_path = old_download_path
            except FileNotFoundError:
                # This can occur if the download fails -- say, the episode is
                # in the Overcast catalogue, but no longer available from source.
                pass

        else:
            # Already downloaded and it's the same episode.
            pass

    # This episode has never been downloaded before, so we definitely have
    # to download it fresh.
    else:
        print(
            "Downloading %s: %s to %s"
            % (episode["podcast"]["title"], audio_url, filename)
        )
        download_url(url=audio_url, path=download_path, description=audio_url)

    # Save a blob of JSON with some episode metadata
    episode["filename"] = filename

    json_string = json.dumps(episode, indent=2, sort_keys=True)

    with open(json_path, "w") as outfile:
        outfile.write(json_string)

    save_rss_feed(episode=episode, download_dir=download_dir)
    mark_episode_as_downloaded(episode=episode, download_dir=download_dir)
272
273
def save_rss_feed(*, episode, download_dir):
    """Save a copy of the RSS feed for the podcast this episode belongs to."""
    podcast = episode["podcast"]
    _save_rss_feed(
        title=podcast["title"],
        xml_url=podcast["xml_url"],
        download_dir=download_dir,
    )
280
281
# Use caching so we only have to download this RSS feed once.
@functools.lru_cache()
def _save_rss_feed(*, title, xml_url, download_dir):
    """
    Download today's snapshot of a podcast's RSS feed into its folder,
    deleting the newest snapshot(s) if they are identical to the previous one.
    """
    podcast_dir = os.path.join(download_dir, _escape(title))

    # One snapshot per day, e.g. ``feed.2001-01-01.xml``, so the date-sorted
    # glob below is also chronologically sorted.
    today = datetime.datetime.now().strftime("%Y-%m-%d")

    rss_path = os.path.join(podcast_dir, f"feed.{today}.xml")

    if not os.path.exists(rss_path):
        print("Downloading RSS feed for %s" % title)
        download_url(
            url=xml_url,
            path=rss_path,
            description="RSS feed for %s" % title,
        )

    matching_feeds = sorted(glob.glob(os.path.join(podcast_dir, "feed.*.xml")))

    # If the newest snapshot is byte-identical to the one before it, it adds
    # no information -- delete it, and keep trimming while that holds.
    while (
        len(matching_feeds) >= 2 and
        filecmp.cmp(matching_feeds[-2], matching_feeds[-1], shallow=False)
    ):
        os.unlink(matching_feeds[-1])
        matching_feeds.remove(matching_feeds[-1])
307
308
if __name__ == "__main__":
    args = parse_args(argv=sys.argv[1:])

    opml_path = args["opml_path"]
    download_dir = args["download_dir"]

    # Read the OPML export, with a friendly error if the file doesn't exist;
    # any other OSError (permissions, etc.) is re-raised unchanged.
    try:
        with open(opml_path) as infile:
            xml_string = infile.read()
    except OSError as err:
        if err.errno == errno.ENOENT:
            sys.exit("Could not find an OPML file at %s" % opml_path)
        else:
            raise

    # Download every played episode listed in the export.
    for episode in get_episodes(xml_string):
        download_episode(episode, download_dir=download_dir)