forked from pydata/patsy
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtokens.py
More file actions
203 lines (178 loc) · 7.95 KB
/
tokens.py
File metadata and controls
203 lines (178 loc) · 7.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
# This file is part of Patsy
# Copyright (C) 2011 Nathaniel Smith <njs@pobox.com>
# See file LICENSE.txt for license information.
# Utilities for dealing with Python code at the token level.
#
# Includes:
# a "pretty printer" that converts a sequence of tokens back into a
# readable, white-space normalized string.
# a utility function to replace calls to global functions with calls to
# other functions
from io import StringIO
import tokenize
from patsy import PatsyError
from patsy.origin import Origin
__all__ = ["python_tokenize", "pretty_untokenize", "normalize_token_spacing"]
# A convenience wrapper around tokenize.generate_tokens. yields tuples
# (tokenize type, token string, origin object)
def python_tokenize(code):
    """Tokenize *code*, yielding ``(token_type, token_string, Origin)`` triples.

    Raises :class:`PatsyError` on error tokens (e.g. an unclosed string) and
    on comments, which are not allowed in formulas.
    """
    # Formulas contain only Python expressions, and expressions cannot
    # meaningfully span physical lines, so flatten newlines away up front to
    # avoid any multi-line complications.
    code = code.replace("\n", " ").strip()
    token_stream = tokenize.generate_tokens(StringIO(code).readline)
    try:
        for pytype, string, (_, start), (_, end), line in token_stream:
            if pytype == tokenize.ENDMARKER:
                break
            if pytype in (tokenize.NL, tokenize.NEWLINE):
                assert string == ""
                continue
            origin = Origin(line, start, end)
            if pytype == tokenize.ERRORTOKEN:
                raise PatsyError(
                    "error tokenizing input (maybe an unclosed string?)", origin
                )
            if pytype == tokenize.COMMENT:
                raise PatsyError("comments are not allowed", origin)
            yield (pytype, string, origin)
        else:  # pragma: no cover
            raise ValueError("stream ended without ENDMARKER?!?")
    except tokenize.TokenError as e:
        # TokenError is raised iff the tokenizer believes a multi-line
        # construct is still open (e.g. an unclosed parenthesis, which in
        # Python lets a virtual line continue past the physical line) when it
        # hits the end of the source. We have our own handling for those
        # cases, so treat this as a plain end-of-stream -- except for
        # unterminated strings, which we surface as a PatsyError.
        if "unterminated string literal" in e.args[0]:
            raise PatsyError(
                "error tokenizing input ({})".format(e.args[0]),
                Origin(code, 0, len(code)),
            )
        # Guard in case future tokenize versions add other error cases:
        assert "EOF in multi-line" in e.args[0]
        return
def test_python_tokenize():
    """Check token streams and error behavior of python_tokenize."""
    # Fix: `import pytest` was duplicated (appeared twice mid-function);
    # import it once, up front.
    import pytest

    code = "a + (foo * -1)"
    tokens = list(python_tokenize(code))
    expected = [
        (tokenize.NAME, "a", Origin(code, 0, 1)),
        (tokenize.OP, "+", Origin(code, 2, 3)),
        (tokenize.OP, "(", Origin(code, 4, 5)),
        (tokenize.NAME, "foo", Origin(code, 5, 8)),
        (tokenize.OP, "*", Origin(code, 9, 10)),
        (tokenize.OP, "-", Origin(code, 11, 12)),
        (tokenize.NUMBER, "1", Origin(code, 12, 13)),
        (tokenize.OP, ")", Origin(code, 13, 14)),
    ]
    assert tokens == expected

    # An unclosed paren is tolerated (handled by our own parser downstream).
    code2 = "a + (b"
    tokens2 = list(python_tokenize(code2))
    expected2 = [
        (tokenize.NAME, "a", Origin(code2, 0, 1)),
        (tokenize.OP, "+", Origin(code2, 2, 3)),
        (tokenize.OP, "(", Origin(code2, 4, 5)),
        (tokenize.NAME, "b", Origin(code2, 5, 6)),
    ]
    assert tokens2 == expected2

    # Comments and unterminated strings are rejected.
    pytest.raises(PatsyError, list, python_tokenize("a b # c"))
    pytest.raises(PatsyError, list, python_tokenize('a b "c'))
_python_space_both = list("+-*/%&^|<>") + [
"==",
"<>",
"!=",
"<=",
">=",
"<<",
">>",
"**",
"//",
]
_python_space_before = _python_space_both + ["!", "~"]
_python_space_after = _python_space_both + [",", ":"]
def pretty_untokenize(typed_tokens):
    """Render ``(token_type, token_string)`` pairs as a whitespace-normalized
    string.

    Spacing is driven by a small state machine: literals (names, numbers,
    strings) are separated from each other, operators get spaces per the
    ``_python_space_*`` tables, and a handful of special cases (slices,
    star-args, keyword arguments, unary +/-) override the tables.
    """
    literal_types = (tokenize.NAME, tokenize.NUMBER, tokenize.STRING)
    chunks = []
    after_literal = False        # last emitted token was a literal
    pending_space = False        # last token requested a trailing space
    after_open_or_comma = False  # last token was "(" or ","
    after_object = False         # last token could end an expression
    open_brackets = []
    for tok_type, tok in typed_tokens:
        assert tok_type not in (tokenize.INDENT, tokenize.DEDENT, tokenize.NL)
        if tok_type in (tokenize.NEWLINE, tokenize.ENDMARKER):
            continue
        if tok_type in literal_types:
            if pending_space or after_literal:
                chunks.append(" ")
            chunks.append(tok)
            pending_space = False
            after_literal = True
        else:
            # Track bracket nesting so ":" and "=" can be contextualized.
            if tok in ("(", "[", "{"):
                open_brackets.append(tok)
            elif open_brackets and tok in (")", "]", "}"):
                open_brackets.pop()
            space_before = tok in _python_space_before
            space_after = tok in _python_space_after
            # Slice colon: foo[:10] -- no space after. (Elsewhere ":" keeps
            # its trailing space, as in "{1: ...}" or "if a: ...".)
            if tok == ":" and open_brackets and open_brackets[-1] == "[":
                space_after = False
            # Star-args: foo(*args), foo(a, **kw) -- glue to the name.
            if tok in ("*", "**") and after_open_or_comma:
                space_before = False
                space_after = False
            # Top-level assignment ("a = foo(b=1)") is spaced; keyword
            # arguments (inside brackets) are not.
            if tok == "=" and not open_brackets:
                space_before = True
                space_after = True
            # Unary +/- heuristic: a +/- following something object-like (a
            # NAME, NUMBER, STRING, or close paren) is probably binary;
            # otherwise probably unary, so glue it to its operand.
            if tok in ("+", "-") and not after_object:
                space_before = False
                space_after = False
            if pending_space or space_before:
                chunks.append(" ")
            chunks.append(tok)
            pending_space = space_after
            after_literal = False
        after_object = tok_type in literal_types or tok == ")"
        after_open_or_comma = tok in ("(", ",")
    return "".join(chunks)
def normalize_token_spacing(code):
    """Return *code* re-rendered with normalized whitespace."""
    # generate_tokens yields 5-tuples; pretty_untokenize only needs the
    # (type, string) prefix of each.
    pairs = (
        (tok_type, tok_string)
        for tok_type, tok_string, _, _, _ in tokenize.generate_tokens(
            StringIO(code).readline
        )
    )
    return pretty_untokenize(pairs)
def test_pretty_untokenize_and_normalize_token_spacing():
    """Table-driven check of the pretty-printer's spacing decisions."""
    cases = [
        ("1 + 1", "1 + 1"),
        ("1+1", "1 + 1"),
        ("1*(2+3**2)", "1 * (2 + 3 ** 2)"),
        ("a and b", "a and b"),
        ("foo(a=bar.baz[1:])", "foo(a=bar.baz[1:])"),
        ('{"hi":foo[:]}', '{"hi": foo[:]}'),
        ("'a' \"b\" 'c'", "'a' \"b\" 'c'"),
        ('"""a""" is 1 or 2==3', '"""a""" is 1 or 2 == 3'),
        ("foo ( * args )", "foo(*args)"),
        ("foo ( a * args )", "foo(a * args)"),
        ("foo ( ** args )", "foo(**args)"),
        ("foo ( a ** args )", "foo(a ** args)"),
        ("foo (1, * args )", "foo(1, *args)"),
        ("foo (1, a * args )", "foo(1, a * args)"),
        ("foo (1, ** args )", "foo(1, **args)"),
        ("foo (1, a ** args )", "foo(1, a ** args)"),
        ("a=foo(b = 1)", "a = foo(b=1)"),
        ("foo(+ 10, bar = - 1)", "foo(+10, bar=-1)"),
        ("1 + +10 + -1 - 5", "1 + +10 + -1 - 5"),
    ]
    for raw, expected in cases:
        assert normalize_token_spacing(raw) == expected