|
@@ -0,0 +1,72 @@
|
|
|
+#!/usr/bin/env -S nix-shell # -*- python -*-
|
|
|
+#!nix-shell -i python3 -p "python3.withPackages(ps: with ps; [ beautifulsoup4 html5lib tqdm ])"
|
|
|
+# coding: utf-8
|
|
|
+
|
|
|
+import os.path as osp
|
|
|
+import sys
|
|
|
+from argparse import ArgumentParser
|
|
|
+
|
|
|
+from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
|
|
|
+from tqdm import tqdm
|
|
|
+
|
|
|
import warnings

# Silence bs4's XMLParsedAsHTMLWarning; the input is parsed with an HTML
# parser (html5lib) later in this script.
warnings.filterwarnings('ignore', category=XMLParsedAsHTMLWarning, module='bs4')
|
|
|
+
|
|
|
# Command-line interface, declared as data: each entry pairs the flag names
# (or the positional name) with the keyword options for add_argument().
_ARGUMENT_SPECS = (
    (('-a', '--append'), {
        'action': 'store_true',
        'default': False,
        'help': 'append to output file, do not create (default, create)',
    }),
    (('-t', '--title'), {
        'dest': 'title_format',
        'default': '{heading}',
        'help': 'format string for headers: {title}, {authors} and {heading} are available',
    }),
    (('-d', '--depth'), {
        'dest': 'heading_depth',
        'default': 1,
        'type': int,
        'help': 'how deep are the headings',
    }),
    (('in_file',), {
        'type': str,
        'help': 'input HTML file from Amazon Kindle highlights',
    }),
    (('out_file',), {
        'type': str,
        'help': 'output Org file',
    }),
)

ap = ArgumentParser(description='Convert Amazon Highlights HTML to an Org file')
for _names, _options in _ARGUMENT_SPECS:
    ap.add_argument(*_names, **_options)

args = ap.parse_args()

# Org heading prefix: one asterisk per requested level of depth.
heading_stars = args.heading_depth * '*'
|
|
|
+
|
|
|
# Read the input HTML as raw bytes. Binary mode lets BeautifulSoup detect the
# document's own encoding instead of relying on the platform's default text
# encoding. Opening directly (rather than testing for existence first) also
# reports directories and unreadable files cleanly instead of with a traceback.
try:
    with open(args.in_file, 'rb') as fp:
        raw_html = fp.read()
except FileNotFoundError:
    print(f"File \"{args.in_file}\" does not exist.", file=sys.stderr)
    sys.exit(1)
except OSError as err:
    print(f"Cannot read \"{args.in_file}\": {err}", file=sys.stderr)
    sys.exit(1)

soup = BeautifulSoup(raw_html, 'html5lib')

# Book metadata. find() returns None when the element is missing, so check
# explicitly and exit with a clear message rather than an AttributeError.
title_div = soup.find("div", "bookTitle")
authors_div = soup.find("div", "authors")
if title_div is None or authors_div is None:
    print(f"File \"{args.in_file}\" has no bookTitle/authors element; "
          "is it a Kindle highlights export?", file=sys.stderr)
    sys.exit(1)

# get_text() is used instead of .string, which is None whenever the element
# has more than one child node.
title = title_div.get_text().strip()
# stripped_strings skips whitespace-only text nodes, so they cannot produce
# stray " and " separators between author names.
authors = ' and '.join(authors_div.stripped_strings)

# Highlight locations and highlight bodies, each in document order; they are
# paired up positionally when the output is written.
note_headings = soup.find_all("div", "noteHeading")
notes = soup.find_all("div", "noteText")
|
|
|
+
|
|
|
# Validate the user-supplied heading format before touching the output file:
# in 'w' mode the file is truncated on open, so a bad format string must not
# be discovered only once writing has started.
try:
    args.title_format.format(title=title, authors=authors, heading='')
except (AttributeError, LookupError, TypeError, ValueError) as err:
    print(f"Invalid title format \"{args.title_format}\": {err!r}",
          file=sys.stderr)
    sys.exit(1)

# Headings and notes are paired by position; zip() stops at the shorter list,
# so make any mismatch visible instead of dropping entries silently.
if len(note_headings) != len(notes):
    print(f"Warning: found {len(note_headings)} headings but {len(notes)} "
          "notes; unmatched entries are skipped.", file=sys.stderr)
pair_count = min(len(note_headings), len(notes))

# Name of the most recently emitted heading; consecutive notes that share a
# heading are grouped under a single Org heading.
last_heading = ''

# Write UTF-8 explicitly so non-ASCII highlight text does not depend on the
# platform's default encoding.
with open(args.out_file, 'a' if args.append else 'w', encoding='utf-8') as fp:
    # zip() has no length, so tell tqdm the total to get a real progress bar.
    for heading, note in tqdm(zip(note_headings, notes), total=pair_count):
        heading_name = heading.get_text().strip()
        if heading_name != last_heading:
            heading_title = args.title_format.format(title=title,
                                                     authors=authors,
                                                     heading=heading_name)
            fp.write(f"\n\n{heading_stars} {heading_title}\n\n")
            last_heading = heading_name
        note_text = note.get_text().strip()
        fp.write(note_text)
        fp.write('\n')
|