#!/usr/bin/env -S nix-shell
# -*- python -*-
#!nix-shell -i python3 -p "python3.withPackages(ps: with ps; [ beautifulsoup4 html5lib tqdm ])"
# coding: utf-8
"""Convert an Amazon Kindle highlights HTML export into an Org file.

The input is parsed for ``div`` elements with the classes ``bookTitle``,
``authors``, ``noteHeading`` and ``noteText``.  Headings and note texts are
paired in document order; an Org heading is written whenever the heading
text differs from the previous one, followed by the note text on its own
line.

Usage: SCRIPT [-a] [-t TITLE_FORMAT] [-d DEPTH] in_file out_file
"""

import os.path as osp
import sys
import warnings
from argparse import ArgumentParser

from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
from tqdm import tqdm

# The input is fed to an HTML parser; silence bs4's "XML parsed as HTML"
# warning instead of showing it on every run.
warnings.filterwarnings(action='ignore', category=XMLParsedAsHTMLWarning, module='bs4')


def _require_div(document, css_class, source_name):
    """Return the first ``<div class=css_class>`` of *document*.

    Prints an error to stderr and exits with status 1 when no such element
    exists, instead of failing later with an ``AttributeError`` on ``None``.
    """
    tag = document.find("div", css_class)
    if tag is None:
        print(f"File \"{source_name}\" has no <div class=\"{css_class}\"> element.",
              file=sys.stderr)
        sys.exit(1)
    return tag


ap = ArgumentParser(description='Convert Amazon Highlights HTML to an Org file')
ap.add_argument('-a', '--append', action='store_true', default=False,
                help='append to output file, do not create (default, create)')
ap.add_argument('-t', '--title', dest='title_format', default='{heading}',
                help='format string for headers: {title}, {authors} and {heading} are available')
ap.add_argument('-d', '--depth', dest='heading_depth', default=1, type=int,
                help='how deep are the headings')
ap.add_argument('in_file', type=str,
                help='input HTML file from Amazon Kindle highlights')
ap.add_argument('out_file', type=str,
                help='output Org file')
args = ap.parse_args()

# Org heading prefix: one '*' per requested nesting level.
heading_stars = '*' * args.heading_depth

if not osp.exists(args.in_file):
    print(f"File \"{args.in_file}\" does not exist.", file = sys.stderr)
    sys.exit(1)

# Decode explicitly instead of relying on the locale's default encoding.
# NOTE(review): assumes the Kindle export is UTF-8 -- confirm.
with open(args.in_file, encoding='utf-8') as fp:
    soup = BeautifulSoup(fp, 'html5lib')

# get_text() is used rather than .string: .string is None as soon as the div
# holds more than one child node, which would crash on .strip().
title = _require_div(soup, "bookTitle", args.in_file).get_text().strip()
authors = ' and '.join(_require_div(soup, "authors", args.in_file).strings).strip()

note_headings = soup.find_all("div", "noteHeading")
notes = soup.find_all("div", "noteText")

# zip() stops at the shorter sequence; tell the user when that drops entries
# rather than silently losing them.
pair_count = min(len(note_headings), len(notes))
if len(note_headings) != len(notes):
    print(f"Warning: found {len(note_headings)} note headings but {len(notes)} note texts; "
          f"only the first {pair_count} pairs are written.", file=sys.stderr)

last_heading = ''

with open(args.out_file, 'a' if args.append else 'w', encoding='utf-8') as fp:
    # zip() has no len(), so give tqdm the total explicitly to get a real
    # progress bar instead of a bare counter.
    for heading, note in tqdm(zip(note_headings, notes), total=pair_count):
        heading_name = ''.join(heading.strings).strip()

        # Consecutive notes under the same heading share one Org heading.
        if heading_name != last_heading:
            heading_title = args.title_format.format(title=title,
                                                     authors=authors,
                                                     heading=heading_name)
            fp.write(f"\n\n{heading_stars} {heading_title}\n\n")
            last_heading = heading_name

        note_text = ''.join(note.strings).strip()
        fp.write(note_text)
        fp.write('\n')