Getting suttacentral translations in plain text

I am trying to build a question/answer AI chat bot which will be trained on the translations of the Tipiṭaka and its commentaries.

What is the best way to grab hold of all the content in plain text?


Hi @harishvs, welcome to the forum - glad you found us!

Enjoy the multiple resources here available: may these be of assistance along the path.

Should you have any questions about the forum, feel free to contact the @moderators.

With mettā,


I’ve also been tinkering with this stuff too - which model are you thinking of using - Falcon? At the moment I think any Llama based model has specific licensing for research purposes only - it’s worth checking the licensing for any model you end up using. If you want to maybe craft your training/fine tuning against questions then answers, I’ve gone some way to identify who is speaking in the suttas, so use this if you like!: suttamap/data/speakers/sutta at main · michaelh-sc/suttamap · GitHub

There are many methods to ingest the suttas into plain text. The way I do this is to take some text input and perform a wildcard search against the filenames inside the /translation folder of bilara-data; note there is also formatting inside, in case you want to replicate some of the formatting on SC such as paragraphs.

There is some API documentation in this thread. Here is some Python just for getting the suttas in text format, without chapter headings and without paragraphs or other line formatting:

import os
import sys
import re
import json
from natsort import natsorted
from pathlib import Path
from pathlib import PurePath

# Absolute path to the local clone of SuttaCentral's `bilara-data` repository,
# expected to sit in a `bilara-data` directory next to this script.
rootbil = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'bilara-data')

def get_bilara_json(suttaglob, root_text=False, root_filename=None, target_dict_only=False):
    """Collect bilara-data segment dicts for every text matching `suttaglob`.

    Walks the local `bilara-data` checkout, keeps the JSON files whose path
    matches the `suttaglob` regex (English sutta translations by Sujato,
    vinaya translations by Brahmali, or Pali root texts when `root_text` is
    True), and loads each one together with its matching `_html.json`
    markup file when present.

    Returns:
        [bils, html_dicts, sutta_names, bil_paths] — four parallel lists:
        segment dicts, html markup dicts ({} when the html file is missing),
        sutta uids, and repo-relative file paths.
    """
    if root_text:
        # Pali root text files: *_root-pli-ms.json (same naming for vinaya).
        lan = 'pli'
        filename_check = '_root-pli-ms.json'
        filename_check_vin = filename_check
    else:
        # English translations: Sujato for suttas, Brahmali for vinaya.
        # NOTE: in the original paste this branch was merged into the
        # `if root_text:` body, so the 'pli' settings were always clobbered.
        lan = 'en'
        filename_check = '_translation-en-sujato.json'
        filename_check_vin = '_translation-en-brahmali.json'

    def remove_prefix(text, prefix):
        # str.removeprefix() exists only on Python 3.9+, so keep a shim.
        if text.startswith(prefix):
            return text[len(prefix):]
        return text

    sutta_names = []
    bil_paths = []
    html_dicts = []
    bils = []

    bil_file_paths = []
    for r, d, f in os.walk(rootbil):
        for file in f:
            if '.json' in file:
                bil_file_paths.append(os.path.join(r, file))

    def lan_key(a_string):
        # Sort key: group paths by everything before 'html' so the html
        # markup files order predictably relative to translation files.
        return a_string.split('html')[0], a_string

    paths = sorted(bil_file_paths, key=lan_key, reverse=True)

    for inbilname in natsorted(paths):
        # Only the expected language/translator files are candidates.
        if not (lan in inbilname
                and (filename_check in inbilname or filename_check_vin in inbilname)):
            continue
        # Keep the file when it matches the caller's regex, or (target-dict
        # mode) one of the globally configured `expl_targets` substrings.
        if not (re.match(".*{}.*".format(suttaglob), inbilname)
                or (target_dict_only
                    and any(targ in inbilname for targ in expl_targets))):
            continue

        sutta_name = os.path.basename(inbilname).split('_')[0]
        bilpath_root = remove_prefix(inbilname, rootbil)
        bilpath_s = remove_prefix(bilpath_root, '/translation/en/sujato/sutta/')
        bilpath = remove_prefix(bilpath_s, '/translation/en/brahmali/vinaya/')

        if '-tv-' in inbilname and 'brahmali' in inbilname:  # vinaya
            sutta_sub_path = str(os.path.dirname(inbilname).split('en/brahmali/vinaya')[1])
            roothtmlpath = Path('html', 'pli', 'ms', 'vinaya')
        elif '-tv-' not in inbilname:
            sutta_sub_path = str(os.path.dirname(inbilname).split('en/sujato/sutta')[1])
            roothtmlpath = Path('html', 'pli', 'ms', 'sutta')
        else:
            # A '-tv-' file not translated by Brahmali: no known html layout
            # (the original would have crashed on `None.lstrip` here).
            continue

        htmlsubpath = Path(sutta_sub_path.lstrip(os.sep))
        htmlpath = os.path.join(rootbil, roothtmlpath, htmlsubpath,
                                '{}_html.json'.format(sutta_name))
        if os.path.isfile(htmlpath):
            with open(Path(htmlpath).as_posix(), encoding='utf-8') as hf:
                html_dict = json.load(hf)
        else:
            # The original printed "Exiting." but never exited; record an
            # empty markup dict and carry on instead.
            print("Can't find html file for sutta {}. \n ({}) Skipping html.".format(sutta_name, htmlpath))
            html_dict = {}

        with open(Path(inbilname).as_posix(), encoding='utf-8') as bf:
            inbil = json.load(bf)
        # Drop segments whose "translation" is just a single space.
        outbil = {k: v for k, v in inbil.items() if v != " "}

        # The original never appended these, so callers always received
        # four empty lists back; accumulate the parallel results here.
        bils.append(outbil)
        html_dicts.append(html_dict)
        sutta_names.append(sutta_name)
        bil_paths.append(bilpath)

    if len(bils) == 0:
        print('no bilara found, exiting')
    return [bils, html_dicts, sutta_names, bil_paths]

def do_the_thing(bil, bil_path, html_dict, sutta_name):
    """Concatenate the translated segments of one sutta into plain text.

    Skips chapter-title segments (muids ending in '.0' or containing ':0.')
    and near-empty non-translated segments (two characters or fewer).
    `bil_path` and `html_dict` are accepted for interface compatibility but
    are not used here.

    Returns the assembled plain-text string.
    """
    sutta_text = ""
    for muid_in, line_in in bil.items():
        # Remove chapter titles and non-translated muids; the original
        # paste had this body mis-indented (IndentationError) and, read as
        # written, would have kept exactly the lines it meant to drop.
        if (len(line_in) <= 2
                or re.match(r".*\.0$", muid_in)
                or re.match(r'.*\:0\..*', muid_in)):
            continue
        sutta_text += line_in
    return sutta_text


# --- Script entry point ---------------------------------------------------
# The sutta-uid regex is required; previously a missing argument left
# `suttaglob` undefined and crashed with a NameError two lines later.
if len(sys.argv) > 1:
    suttaglob = sys.argv[1]
else:
    sys.exit('usage: {} <sutta-uid regex>'.format(os.path.basename(sys.argv[0])))

bils, html_dicts, sutta_names, bil_paths = get_bilara_json(suttaglob)

# Render each matched sutta's segments to plain text.
for bil, html_dict, sutta_name, bil_path in zip(bils, html_dicts, sutta_names, bil_paths):
    do_the_thing(bil=bil, bil_path=bil_path, html_dict=html_dict, sutta_name=sutta_name)
1 Like

What is the input I should pass to this script? That is, what should the value of suttaglob be?

It is a regex of the sutta uid/filename. I sent an example as the regex_search parameter for the five nikāyas in a new message yesterday.