Getting suttacentral translations in plain text

michaelh · June 11, 2023, 8:53am

I’ve been tinkering with this stuff too - which model are you thinking of using - Falcon? At the moment I think any Llama-based model is licensed for research purposes only - it’s worth checking the licensing for any model you end up using. If you want to craft your training/fine-tuning data as questions followed by answers, I’ve gone some way towards identifying who is speaking in the suttas, so use this if you like: suttamap/data/speakers/sutta at main · michaelh-sc/suttamap · GitHub

There are many ways to ingest the suttas as plain text. The way I do it is to take some text input and use it for a wildcard search over the filenames inside the /translation folder of https://github.com/suttacentral/bilara-data, but there is also formatting information inside https://github.com/suttacentral/bilara-data/tree/published/html/pli/ms/sutta if you want to replicate some of the formatting on SC, such as paragraphs.

There is some API documentation in this thread. Here is some Python just for getting the suttas in text format, without chapter headings and without paragraphs or other line formatting:

import os
import sys
import re
import json
from natsort import natsorted
from pathlib import Path
from pathlib import PurePath

# The bilara-data checkout is expected to sit in a 'bilara-data' folder next to this script.
_script_dir = os.path.dirname(os.path.abspath(__file__))
rootbil = os.path.join(_script_dir, 'bilara-data')

def _remove_prefix(text, prefix):
    """Return ``text`` without a leading ``prefix``; unchanged if it is absent."""
    if text.startswith(prefix):
        return text[len(prefix):]
    return text


def _load_json(path):
    """Parse the UTF-8 JSON file at ``path``, closing the handle afterwards."""
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


def get_bilara_json(suttaglob, root_text=False, root_filename=None, target_dict_only=False):
    """Load every bilara segment file under ``rootbil`` that matches ``suttaglob``.

    Args:
        suttaglob: Regular-expression fragment; a file is selected when
            ``.*<suttaglob>.*`` matches its full path from the start.
        root_text: When true, select the Pali root files
            (``*_root-pli-ms.json``) instead of the English translations
            (``*_translation-en-sujato.json`` / ``*_translation-en-brahmali.json``).
        root_filename: Unused; kept so existing callers keep working.
        target_dict_only: When true, a file is also selected if its path
            contains any entry of a module-level ``expl_targets`` iterable.

    Returns:
        ``[bils, html_dicts, sutta_names, bil_paths]`` -- four lists of equal
        length, index-aligned, one entry per selected file:
        the segment dict (entries whose value is a single space removed),
        the parsed ``*_html.json`` dict, the file-name stem before the first
        underscore, and the path with ``rootbil`` and the collection prefix
        stripped.

    Raises:
        SystemExit: With status 1 when a matching html file is missing or
            when nothing matched at all.
    """
    # Kept for backward compatibility: this function has always published
    # these names at module level.
    global bilname, bilpath, sutta_name

    if root_text:
        lan = 'pli'
        filename_checks = ('_root-pli-ms.json',)
        # NOTE(review): assumes root texts live under root/pli/ms/{sutta,vinaya}
        # inside bilara-data -- confirm against your checkout.
        sutta_marker = 'pli/ms/sutta'
        vinaya_marker = 'pli/ms/vinaya'
        strip_prefixes = ('/root/pli/ms/sutta/', '/root/pli/ms/vinaya/')
    else:
        lan = 'en'
        filename_checks = ('_translation-en-sujato.json', '_translation-en-brahmali.json')
        sutta_marker = 'en/sujato/sutta'
        vinaya_marker = 'en/brahmali/vinaya'
        strip_prefixes = ('/translation/en/sujato/sutta/', '/translation/en/brahmali/vinaya/')

    sutta_names = []
    bil_paths = []
    html_dicts = []
    bils = []

    bil_file_paths = [
        os.path.join(folder, name)
        for folder, _dirs, names in os.walk(rootbil)
        for name in names
        if '.json' in name
    ]

    # natsorted() below decides the final order; this pre-sort only influences
    # the relative order of paths that natsort treats as equal.
    paths = sorted(bil_file_paths, key=lambda p: (p.split('html')[0], p), reverse=True)

    bilpath = None

    # Compile once instead of re-building the pattern for every file.
    pattern = re.compile(".*{}.*".format(suttaglob))
    # ``expl_targets`` is not defined alongside this function; treat a missing
    # definition as "no explicit targets" instead of raising NameError.
    targets = globals().get('expl_targets', ())

    for inbilname in natsorted(paths):
        if lan not in inbilname:
            continue
        if not any(check in inbilname for check in filename_checks):
            continue
        if not (pattern.match(inbilname)
                or (target_dict_only and any(targ in inbilname for targ in targets))):
            continue

        # Vinaya files carry '-tv-' in their name; everything else is a sutta.
        folder = os.path.dirname(inbilname)
        if '-tv-' in inbilname:
            marker, collection = vinaya_marker, 'vinaya'
        else:
            marker, collection = sutta_marker, 'sutta'
        if marker not in folder:
            # Not inside the expected collection directory: skip it *before*
            # recording anything so the four result lists stay aligned.
            continue
        sutta_sub_path = folder.split(marker)[1]
        roothtmlpath = Path('html', 'pli', 'ms', collection)

        sutta_name = os.path.basename(inbilname).split('_')[0]
        bilpath = _remove_prefix(inbilname, rootbil)
        for prefix in strip_prefixes:
            bilpath = _remove_prefix(bilpath, prefix)

        htmlsubpath = Path(sutta_sub_path.lstrip(os.sep))
        htmlpath = os.path.join(rootbil, roothtmlpath, htmlsubpath, '{}_html.json'.format(sutta_name))
        if not os.path.isfile(htmlpath):
            print("Can't find html file for sutta {}. \n ({}) Exiting.".format(sutta_name, htmlpath))
            sys.exit(1)

        html_dict = _load_json(htmlpath)
        print(inbilname)
        inbil = _load_json(Path(inbilname).as_posix())
        # Drop placeholder segments whose text is a single space.
        outbil = {k: v for k, v in inbil.items() if v != " "}

        sutta_names.append(sutta_name)
        bil_paths.append(bilpath)
        html_dicts.append(html_dict)
        bils.append(outbil)

    if not bils:
        print('no bilara found, exiting')
        sys.exit(1)

    return [bils, html_dicts, sutta_names, bil_paths]

def do_the_thing(bil, bil_path, html_dict, sutta_name):
    """Print the body text of one sutta as a single concatenated line.

    Segments are taken from ``bil`` in iteration order and joined with no
    separator. A segment is left out when its text is at most two characters
    long, when its id ends in ``.0``, or when its id contains ``:0.``.
    ``bil_path``, ``html_dict`` and ``sutta_name`` are accepted but not used.
    """
    def is_body_segment(segment_id, text):
        # Very short strings are treated as untranslated or empty segments.
        if len(text) <= 2:
            return False
        # Ids ending in '.0' are section headings.
        if re.match(r".*\.0$", segment_id):
            return False
        # Ids containing ':0.' belong to the title block.
        return re.match(r'.*\:0\..*', segment_id) is None

    body = "".join(
        text for segment_id, text in bil.items() if is_body_segment(segment_id, text)
    )
    print(body)


def main(argv=None):
    """Print the plain text of every sutta matching the first CLI argument.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.

    Raises:
        SystemExit: With a usage message when no pattern is supplied
            (previously this surfaced as a NameError on ``suttaglob``).
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.exit("usage: {} SUTTA_PATTERN".format(os.path.basename(sys.argv[0])))
    suttaglob = args[0]

    bils, html_dicts, sutta_names, bil_paths = get_bilara_json(suttaglob)

    for bil, html_dict, sutta_name, bil_path in zip(bils, html_dicts, sutta_names, bil_paths):
        do_the_thing(bil=bil, bil_path=bil_path, html_dict=html_dict, sutta_name=sutta_name)


if __name__ == "__main__":
    main()