forked from isocpp/CppCoreGuidelines
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmd-split.py
More file actions
executable file
·177 lines (148 loc) · 6.51 KB
/
md-split.py
File metadata and controls
executable file
·177 lines (148 loc) · 6.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#! /usr/bin/env python
# A script that splits a Markdown file into plain text (for spell checking) and C++ files.
from __future__ import absolute_import, print_function, unicode_literals
import os
import shutil
import io
import argparse
import re, cgi
# Matches either an HTML comment (<!-- ... -->) or a single <...> tag.
TAG_REGEX = re.compile(r'(<!--.*?-->|<[^>]*>)')
# Captures the quoted value of a name="..." attribute. The leading ".*" is
# greedy, so with match() the last such attribute on the line is captured.
NAMED_A_TAG_REGEX = re.compile(r'.*name ?= ?"([^"]*)"')
def main():
    """
    Split a Markdown file into a plain-text file and per-snippet C++ files.

    This script ended up ugly, so in case somebody wants to reimplement it,
    here is the spec that grew over time.

    What it should do is take a markdown file and split it into more files.
    The targetfile should have the same number of lines as the original, with
    source code snippets and markdown non-words removed, for spell-checking.

    Each code snippet should go into a separate file in codedir.

    Each code snippet should get additional C++ code around it to help compile
    the lines in context, with some heuristic guessing of what is needed
    around. The wrapping code should have a token in each line allowing other
    tools to filter out these lines.

    The name chosen for each file consists of the section id in the markdown
    document and a counter for the snippet inside the section.

    Snippets without code (only comments) or containing lines starting with
    ??? should not yield files, but the counter for naming snippets should
    still increment.

    Command line arguments: sourcefile, targetfile, codedir.
    """
    parser = argparse.ArgumentParser(description='Split md file into plain text and code blocks')
    parser.add_argument('sourcefile',
                        help='which file to read')
    parser.add_argument('targetfile',
                        help='where to put plain text')
    parser.add_argument('codedir',
                        help='where to put codeblocks')
    args = parser.parse_args()

    # ensure folder exists
    if not os.path.exists(args.codedir):
        os.makedirs(args.codedir)

    if os.path.exists(args.targetfile):
        os.remove(args.targetfile)

    code_block_index = 0
    last_header = ''
    linenum = 0
    with io.open(args.sourcefile, 'r') as read_filehandle:
        with io.open(args.targetfile, 'w') as text_filehandle:
            for line in read_filehandle:
                linenum += 1
                indent_depth = is_code(line)
                if indent_depth:
                    # process_code consumes the whole block and hands back the
                    # first line it did not deal with (plus the new line count)
                    (line, linenum) = process_code(read_filehandle,
                                                   text_filehandle,
                                                   line, linenum,
                                                   args.sourcefile, args.codedir,
                                                   last_header, code_block_index,
                                                   indent_depth)
                    code_block_index += 1
                # reach here either line was not code, or was code
                # and we dealt with n code lines.
                # When indent_depth is 0 the line was plain text; do not ask
                # is_code(line, 0), which reports any leading space as code
                # and would silently drop slightly indented prose lines.
                if not indent_depth or not is_code(line, indent_depth):
                    # store header id for codeblock
                    section_id = get_marker(line)
                    if section_id is not None:
                        # snippet numbering restarts in every named section
                        code_block_index = 0
                        last_header = section_id
                    sline = stripped(line)
                    text_filehandle.write(sline)
def process_code(read_filehandle, text_filehandle, line, linenum, sourcefile, codedir, name, index, indent_depth):
    """
    Consume one code block and write its plain-text and C++ outputs.

    Starting at ``line`` (the first line of the block, or an opening ``` fence),
    further lines are pulled from ``read_filehandle`` until the block ends.
    For every consumed line exactly one line is written to ``text_filehandle``:
    the text following ``//`` if the line has a comment, otherwise an empty
    line, so that line numbers stay aligned with the source.

    If the block contains actual code and no ``???`` / ``...`` placeholders,
    the snippet is written to ``<codedir>/<name><index>.cpp`` preceded by a
    fixed header of includes.

    Parameters:
        read_filehandle: iterator over the remaining source lines.
        text_filehandle: writable handle of the plain-text target file.
        line: the line that was recognised as the start of the block.
        linenum: 1-based number of ``line`` in the source file.
        sourcefile: source file name, recorded in the generated header.
        codedir: directory receiving the generated .cpp file.
        name: section id used as the file name prefix.
        index: snippet counter within the section, used as file name suffix.
        indent_depth: indentation of the block as reported by is_code().

    Returns:
        A tuple ``(line, linenum)``: the first line that was read but not
        handled as block content ('' at end of input) and the updated count
        of lines read so far.
    """
    fenced = (line.strip() == '```')
    if fenced:
        # placeholder for the opening fence, written even if the file ends here
        text_filehandle.write('\n')
        try:
            # the next() builtin works on file iterators in Python 2 and 3
            line = next(read_filehandle)
        except StopIteration:
            return ('', linenum)
        linenum += 1
    start_linenum = linenum
    has_actual_code = False
    has_question_marks = False
    linebuffer = []
    while ((fenced and line.strip() != '```')
           or (not fenced and is_inside_code(line, indent_depth))):
        bare = line.strip()

        # copy comments to plain text for spell check
        comment_idx = line.find('//')
        if comment_idx >= 0:
            no_comment_line = line[:comment_idx]
            text_filehandle.write(line[comment_idx + 2:])
        else:
            no_comment_line = line
            # write empty line so line numbers stay stable
            text_filehandle.write('\n')

        # code detection is independent of whether the line carries a comment
        if (not has_actual_code
                and bare != ''
                and not bare.startswith(('//', '???'))):
            has_actual_code = True

        if bare != '```':
            if '???' in no_comment_line or '...' in no_comment_line:
                has_question_marks = True
            linebuffer.append(line if fenced else dedent(line))
        try:
            line = next(read_filehandle)
            linenum += 1
        except StopIteration:
            line = ''
            break

    if fenced and line:
        # placeholder for the closing fence; skipped when input ended first
        text_filehandle.write('\n')

    if has_actual_code and not has_question_marks:
        codefile = os.path.join(codedir, '%s%s.cpp' % (name, index))
        # add commonly used headers, so that lines can compile
        with io.open(codefile, 'w') as code_filehandle:
            code_filehandle.write('''\
#include<stdio.h> // by md-split
#include<stdlib.h> // by md-split
#include<tuple> // by md-split
#include<utility> // by md-split
#include<limits> // by md-split
#include<functional> // by md-split
#include<string> // by md-split
#include<map> // by md-split
#include<iostream> // by md-split
#include<vector> // by md-split
#include<algorithm> // by md-split
#include<memory> // by md-split
using namespace std; // by md-split
// %s : %s
''' % (sourcefile, start_linenum))
            # TODO: if not toplevel code, wrap inside class
            # writelines avoids a loop variable that would clobber ``line``,
            # which must still hold the next unprocessed input line on return
            code_filehandle.writelines(linebuffer)
    return (line, linenum)
def is_code(line, indent_depth=4):
    """
    Report the indentation of ``line`` if it is indented like code.

    Returns the number of leading space characters when ``line`` begins with
    at least ``indent_depth`` spaces, and 0 otherwise.
    """
    required_prefix = ' ' * indent_depth
    if not line.startswith(required_prefix):
        return 0
    without_indent = line.lstrip(' ')
    return len(line) - len(without_indent)
def is_inside_code(line, indent_depth):
    """
    Tell whether ``line`` still belongs to an indented code block.

    Returns the (truthy) result of is_code() when the line is indented by at
    least ``indent_depth`` spaces; otherwise returns True for a line that is
    empty after stripping whitespace and False for anything else.
    """
    depth = is_code(line, indent_depth)
    if depth:
        return depth
    # blank lines do not terminate a code block
    return line.strip() == ''
def stripped(line):
    """
    Return ``line`` cleaned up for spell checking.

    HTML comments and tags matched by TAG_REGEX are removed first; then each
    of the characters ``( ) [ ] # *`` is replaced by a single space.
    """
    # Remove well-formed html tags, fixing mistakes by legitimate users
    sline = TAG_REGEX.sub('', line)
    # Substitute on the tag-free text so the removal above is not thrown away
    sline = re.sub(r'[()\[\]#*]', ' ', sline)
    return sline
def dedent(line):
    """
    Remove one level of code-block indentation from ``line``.

    A leading run of four spaces or a single leading tab is dropped. Lines
    that start with neither (including lines with fewer than four leading
    spaces) are returned unchanged, so no non-indent characters are lost.
    """
    # the prefix tested must be as long as the slice removed
    if line.startswith('    '):
        return line[4:]
    if line.startswith('\t'):
        return line[1:]
    return line
def get_marker(line):
    """
    Extract a section id from ``line``.

    Returns the value captured by NAMED_A_TAG_REGEX when ``line`` contains
    something matched by TAG_REGEX and the name pattern matches from the start
    of the line; returns None in every other case.
    """
    if TAG_REGEX.search(line) is None:
        return None
    found = NAMED_A_TAG_REGEX.match(line)
    # group 1 holds the quoted name; group 0 would be the full match
    return found.group(1) if found else None
# Run the splitter only when this file is executed as a script.
if __name__ == '__main__':
    main()