|
| 1 | +import glob |
| 2 | +import re |
| 3 | +import os |
| 4 | +import collections |
| 5 | + |
# Matches a chapter's glossary heading — the word "Gloss..." plus 3-7 more
# non-dash characters (tolerates heading variants — TODO confirm against the
# actual .rst files), an underline of 8-15 dashes, then captures everything
# after it to the end of the string (re.DOTALL makes '.' span newlines).
GLOSSARY = r'Gloss[^-]{3,7}-{8,15}\n(.*)'

GLOSSARY_RE = re.compile(GLOSSARY, re.DOTALL)

# Stricter variant: requires another 8-15 dash underline AFTER the captured
# body, i.e. the glossary is followed by a further underlined section, so the
# capture stops before the next section instead of running to end-of-file.
GLOSSARY_SECTION_RE = re.compile(GLOSSARY + r'-{8,15}', re.DOTALL)

expected_entries = [  # glossary entries per chapter
    ('01', 21),
    ('02', 19),
    ('03', 22),
    ('04', 11),
    ('05', 15),
    ('06', 5),
    ('07', 8),
    ('08', 12),
    ('09', 3),
    ('10', 14),
    ('11', 20),
    ('12', 8),
    ('13', 6),
    ('14', 14),
    ('15', 9),
    ('16', 7),
    ('17', 9),
    ('18', 13),
    ('19', 5),
    ( 'B', 11),
]

# chapter_id -> expected entry count, for the sanity check in scan_files.
expected_entries_dic = dict(expected_entries)

# One glossary entry: a "term:" line, then an indented definition body that
# runs (non-greedily, across lines via DOTALL) up to the next blank line.
ENTRY_RE = re.compile(r'([^\n]+):\n[ ]+(.*?)\n\n', re.DOTALL)

# term: the glossary term; definition: a Definition namedtuple (below).
GlossaryEntry = collections.namedtuple('GlossaryEntry', 'term definition')
# chapter_id: file-name prefix; position: 1-based index within the chapter's
# glossary; text: whitespace-normalized definition text.
Definition = collections.namedtuple('Definition', 'chapter_id position text')
| 42 | + |
| 43 | + |
def parse_entries(text, chapter_id):
    """Parse the glossary entries found in *text*.

    *text* is the body of a glossary section: each entry is a ``term:``
    line followed by an indented definition ending at a blank line
    (see ENTRY_RE).  Internal whitespace in each definition is collapsed
    to single spaces.

    Returns a list of GlossaryEntry namedtuples whose Definition records
    carry *chapter_id* and a 1-based position in order of appearance.
    """
    return [
        GlossaryEntry(term, Definition(chapter_id, position,
                                       ' '.join(body.split())))
        for position, (term, body) in enumerate(ENTRY_RE.findall(text), 1)
    ]
| 54 | + |
| 55 | + |
def scan_files(*paths):
    """Scan ``*.rst`` files in each directory of *paths* for glossary
    sections and print one pipe-separated record per definition::

        term|chapter_id|position|definition

    Terms are sorted case-insensitively; a term with several definitions
    shows the term only on its first row (a tab placeholder after that).
    The chapter id is the file-name prefix before the first ``'-'``.

    Raises AssertionError when a chapter's entry count disagrees with
    ``expected_entries_dic`` (sanity check; skipped under ``python -O``).
    """
    entries = collections.defaultdict(list)  # term -> [Definition, ...]
    for path in paths:
        for name in glob.glob(os.path.join(path, '*.rst')):
            chapter_id = os.path.basename(name).split('-')[0]
            with open(name, encoding='utf-8') as infile:
                rst = infile.read()
            # Prefer the stricter pattern (glossary followed by another
            # underlined section); fall back to glossary-at-end-of-file.
            gloss_match = (GLOSSARY_SECTION_RE.search(rst) or
                           GLOSSARY_RE.search(rst))
            if gloss_match:
                new_entries = parse_entries(gloss_match.group(1), chapter_id)
                for term, definition in new_entries:
                    entries[term].append(definition)
                assert expected_entries_dic[chapter_id] == len(new_entries), (
                    chapter_id, expected_entries_dic[chapter_id],
                    len(new_entries))
    for term in sorted(entries, key=str.upper):
        for i, (chapter_id, position, definition) in enumerate(
                sorted(entries[term])):
            # Repeated-term rows get a tab in the term column instead of
            # repeating the term.
            print(term if i == 0 else '\t',
                  chapter_id, position, definition, sep='|')
| 82 | + |
| 83 | + |
if __name__ == '__main__':
    # Command-line entry point: each argument is a directory to scan.
    import sys
    paths = sys.argv[1:]
    scan_files(*paths)
0 commit comments