123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172 |
#!/usr/bin/env -S nix-shell # -*- python -*-
#!nix-shell -i python3 -p "python3.withPackages(ps: with ps; [ beautifulsoup4 html5lib tqdm ])"
# coding: utf-8
"""Convert an Amazon Kindle highlights HTML export into an Org file.

The export is scanned for ``div`` elements with the classes ``bookTitle``,
``authors``, ``noteHeading`` and ``noteText``.  Note headings and note texts
are paired in document order; an Org heading is written whenever the note
heading differs from the previous one, followed by the text of each note on
its own line.
"""
import os.path as osp
import sys
import warnings
from argparse import ArgumentParser

from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
from tqdm import tqdm

# The input is parsed with an HTML parser on purpose; silence bs4's
# "XML parsed as HTML" warning.
warnings.filterwarnings(action='ignore',
                        category=XMLParsedAsHTMLWarning,
                        module='bs4')


def _build_parser():
    """Return the argument parser describing the command-line interface."""
    ap = ArgumentParser(description='Convert Amazon Highlights HTML to an Org file')
    ap.add_argument('-a', '--append',
                    action='store_true',
                    default=False,
                    help='append to output file, do not create (default, create)')
    ap.add_argument('-t', '--title',
                    dest='title_format',
                    default='{heading}',
                    help='format string for headers: {title}, {authors} and {heading} are available')
    ap.add_argument('-d', '--depth',
                    dest='heading_depth',
                    default=1,
                    type=int,
                    help='how deep are the headings')
    ap.add_argument('in_file',
                    type=str,
                    help='input HTML file from Amazon Kindle highlights')
    ap.add_argument('out_file',
                    type=str,
                    help='output Org file')
    return ap


def _element_text(element):
    """Return the concatenated, stripped text of *element*.

    Returns an empty string when *element* is ``None`` (the tag was not found
    in the document) instead of raising ``AttributeError``.
    """
    if element is None:
        return ''
    return ''.join(element.strings).strip()


def main(argv=None):
    """Run the converter.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Exits with status 1 when the input file does not exist and with status 2
    (via argparse) when the title format string is invalid.
    """
    ap = _build_parser()
    args = ap.parse_args(argv)
    heading_stars = '*' * args.heading_depth

    if not osp.exists(args.in_file):
        print(f"File \"{args.in_file}\" does not exist.", file=sys.stderr)
        sys.exit(1)

    # Check the format string before the output file is opened, so a typo in
    # --title cannot truncate an existing file and then crash mid-run.
    try:
        args.title_format.format(title='', authors='', heading='')
    except (KeyError, IndexError, ValueError, AttributeError) as err:
        ap.error(f"invalid title format {args.title_format!r}: {err}")

    with open(args.in_file, encoding='utf-8') as fp:
        soup = BeautifulSoup(fp, 'html5lib')

    title = _element_text(soup.find("div", "bookTitle"))

    authors_div = soup.find("div", "authors")
    # stripped_strings skips whitespace-only text nodes, so the ' and '
    # separator only ever appears between real pieces of text.
    authors = ' and '.join(authors_div.stripped_strings) if authors_div is not None else ''

    note_headings = soup.find_all("div", "noteHeading")
    notes = soup.find_all("div", "noteText")
    if len(note_headings) != len(notes):
        # zip() below stops at the shorter sequence; say so instead of
        # dropping the unpaired entries silently.
        print(f"Warning: found {len(note_headings)} note headings but "
              f"{len(notes)} note texts; unpaired entries are skipped.",
              file=sys.stderr)
    pair_count = min(len(note_headings), len(notes))

    last_heading = ''
    with open(args.out_file, 'a' if args.append else 'w', encoding='utf-8') as fp:
        # zip() has no length, so pass the total for a meaningful progress bar.
        for heading, note in tqdm(zip(note_headings, notes), total=pair_count):
            heading_name = _element_text(heading)
            # Consecutive notes under the same heading share one Org heading.
            if heading_name != last_heading:
                heading_title = args.title_format.format(title=title,
                                                         authors=authors,
                                                         heading=heading_name)
                fp.write(f"\n\n{heading_stars} {heading_title}\n\n")
                last_heading = heading_name
            fp.write(_element_text(note))
            fp.write('\n')


if __name__ == '__main__':
    main()
|