kindle-notes-to-org 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. #!/usr/bin/env -S nix-shell # -*- python -*-
  2. #!nix-shell -i python3 -p "python3.withPackages(ps: with ps; [ beautifulsoup4 html5lib tqdm ])"
  3. # coding: utf-8
  4. import os.path as osp
  5. import sys
  6. from argparse import ArgumentParser
  7. from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
  8. from tqdm import tqdm
  9. import warnings
  10. warnings.filterwarnings(action='ignore',
  11. category=XMLParsedAsHTMLWarning,
  12. module='bs4')
  13. ap = ArgumentParser(description='Convert Amazon Highlights HTML to an Org file')
  14. ap.add_argument('-a', '--append',
  15. action='store_true',
  16. default=False,
  17. help='append to output file, do not create (default, create)')
  18. ap.add_argument('-t', '--title',
  19. dest='title_format',
  20. default='{heading}',
  21. help='format string for headers: {title}, {authors} and {heading} are available')
  22. ap.add_argument('-d', '--depth',
  23. dest='heading_depth',
  24. default=1,
  25. type=int,
  26. help='how deep are the headings')
  27. ap.add_argument('in_file',
  28. type=str,
  29. help='input HTML file from Amazon Kindle highlights')
  30. ap.add_argument('out_file',
  31. type=str,
  32. help='output Org file')
  33. args = ap.parse_args()
  34. heading_stars = '*' * args.heading_depth
  35. if not osp.exists(args.in_file):
  36. print(f"File \"{args.in_file}\" does not exist.", file = sys.stderr)
  37. sys.exit(1)
  38. with open(args.in_file) as fp:
  39. soup = BeautifulSoup(fp, 'html5lib')
  40. title = soup.find("div", "bookTitle").string.strip()
  41. authors = ' and '.join(list(soup.find("div", "authors").strings)).strip()
  42. note_headings = soup.find_all("div", "noteHeading")
  43. notes = soup.find_all("div", "noteText")
  44. last_heading = ''
  45. with open(args.out_file, 'a' if args.append else 'w') as fp:
  46. for (heading, note) in tqdm(zip(note_headings, notes)):
  47. heading_name = ''.join(list(heading.strings)).strip()
  48. if heading_name != last_heading:
  49. heading_title = args.title_format.format(title=title,
  50. authors=authors,
  51. heading=heading_name)
  52. fp.write(f"\n\n{heading_stars} {heading_title}\n\n")
  53. last_heading = heading_name
  54. note_text = ''.join(list(note.strings)).strip()
  55. fp.write(note_text)
  56. fp.write('\n')