forked from AllenDowney/ThinkPython2
-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy pathjoin_glossary.py
More file actions
93 lines (77 loc) · 2.76 KB
/
join_glossary.py
File metadata and controls
93 lines (77 loc) · 2.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python3
"""
Extract glossary entries from chapters 1-19 and appendix B and create
a consolidated glossary text file delimited by '|' (pipe characters).
"""
import glob
import re
import os
import collections
# Matches a glossary heading ("Glossary" over an underline of 8-15 dashes)
# and captures everything after it.  [^-]{3,7} tolerates spelling/markup
# variation between "Gloss" and the underline (e.g. "ary\n", "ary:\n").
GLOSSARY = r'Gloss[^-]{3,7}-{8,15}\n(.*)'
# Open-ended variant: captures from the heading to end of file.
GLOSSARY_RE = re.compile(GLOSSARY, re.DOTALL)
# Bounded variant: captures only up to the next section underline, so the
# glossary body stops where the following section begins.  Tried first.
GLOSSARY_SECTION_RE = re.compile(GLOSSARY + r'-{8,15}', re.DOTALL)
expected_entries = [ # glossary entries per chapter
    ('01', 21),
    ('02', 19),
    ('03', 22),
    ('04', 11),
    ('05', 15),
    ('06', 5),
    ('07', 8),
    ('08', 12),
    ('09', 3),
    ('10', 14),
    ('11', 20),
    ('12', 8),
    ('13', 6),
    ('14', 14),
    ('15', 9),
    ('16', 7),
    ('17', 9),
    ('18', 13),
    ('19', 5),
    ( 'B', 11),
]
# chapter_id -> expected entry count, used as a sanity check while scanning.
expected_entries_dic = dict(expected_entries)
# One glossary entry: a "term:" line followed by an indented definition
# body that may span several lines, terminated by a blank line.
ENTRY_RE = re.compile(r'([^\n]+):\n[ ]+(.*?)\n\n', re.DOTALL)
GlossaryEntry = collections.namedtuple('GlossaryEntry', 'term definition')
Definition = collections.namedtuple('Definition', 'chapter_id position text')


def parse_entries(text, chapter_id):
    """Parse glossary *text* into a list of ``GlossaryEntry`` tuples.

    Each entry records its term plus a ``Definition`` carrying the chapter
    id, the 1-based position of the entry within the chapter's glossary,
    and the definition text collapsed onto a single line.
    """
    return [
        # ' '.join(split()) normalizes the multi-line, indented body
        # into one space-separated string.
        GlossaryEntry(term, Definition(chapter_id, pos, ' '.join(body.split())))
        for pos, (term, body) in enumerate(ENTRY_RE.findall(text), 1)
    ]
def scan_files(*paths):
    """Scan ``*.rst`` files under each directory in *paths* and print a
    consolidated, pipe-delimited glossary sorted by term (case-insensitive).

    The chapter id is taken from the filename prefix before the first '-'.
    An AssertionError is raised if a chapter yields a different number of
    entries than recorded in ``expected_entries_dic``.
    """
    collected = collections.defaultdict(list)
    for directory in paths:
        for filename in glob.glob(os.path.join(directory, '*.rst')):
            chapter_id = os.path.basename(filename).split('-')[0]
            with open(filename, encoding='utf-8') as fp:
                source = fp.read()
            # Prefer the bounded match (stops at the next section underline);
            # fall back to the open-ended one for files that end with the
            # glossary.
            found = GLOSSARY_SECTION_RE.search(source)
            if found is None:
                found = GLOSSARY_RE.search(source)
            if found is None:
                continue
            chapter_entries = parse_entries(found.group(1), chapter_id)
            for term, definition in chapter_entries:
                collected[term].append(definition)
            # Sanity check against the hand-counted table of entry counts.
            assert expected_entries_dic[chapter_id] == len(chapter_entries), (
                chapter_id, expected_entries_dic[chapter_id],
                len(chapter_entries))
    for term in sorted(collected, key=str.upper):
        # Print the term once; repeat definitions of the same term get a
        # tab placeholder in the first column.
        label = term
        for chapter_id, position, text in sorted(collected[term]):
            print(label, chapter_id, position, text, sep='|')
            label = '\t'
if __name__ == '__main__':
    # Each command-line argument is a directory to scan for *.rst chapters.
    import sys
    scan_files(*sys.argv[1:])