forked from pydata/patsy
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtokens.py
More file actions
203 lines (178 loc) · 7.95 KB
/
tokens.py
File metadata and controls
203 lines (178 loc) · 7.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
# This file is part of Patsy
# Copyright (C) 2011 Nathaniel Smith <njs@pobox.com>
# See file LICENSE.txt for license information.
# Utilities for dealing with Python code at the token level.
#
# Includes:
# a "pretty printer" that converts a sequence of tokens back into a
# readable, white-space normalized string.
# a utility function to replace calls to global functions with calls to
# other functions
from io import StringIO
import tokenize
from patsy import PatsyError
from patsy.origin import Origin
__all__ = ["python_tokenize", "pretty_untokenize", "normalize_token_spacing"]
# A convenience wrapper around tokenize.generate_tokens. yields tuples
# (tokenize type, token string, origin object)
def python_tokenize(code):
    """Tokenize *code*, yielding ``(token_type, token_string, Origin)`` triples.

    Raises :class:`PatsyError` on error tokens (e.g. an unclosed string) and
    on comments, which are not allowed in formulas.
    """
    # Formulas contain only Python expressions, and expressions cannot
    # meaningfully span physical lines, so flatten newlines away up front to
    # avoid any multi-line complications.
    code = code.replace("\n", " ").strip()
    token_stream = tokenize.generate_tokens(StringIO(code).readline)
    try:
        for pytype, string, (_, start), (_, end), line in token_stream:
            if pytype == tokenize.ENDMARKER:
                break
            if pytype in (tokenize.NL, tokenize.NEWLINE):
                assert string == ""
                continue
            origin = Origin(line, start, end)
            if pytype == tokenize.ERRORTOKEN:
                raise PatsyError(
                    "error tokenizing input (maybe an unclosed string?)", origin
                )
            if pytype == tokenize.COMMENT:
                raise PatsyError("comments are not allowed", origin)
            yield (pytype, string, origin)
        else:  # pragma: no cover
            raise ValueError("stream ended without ENDMARKER?!?")
    except tokenize.TokenError as e:
        # TokenError is raised iff the tokenizer believes a multi-line
        # construct is still open (e.g. an unclosed parenthesis, which in
        # Python lets a virtual line continue past the physical line) when it
        # hits the end of the source. We have our own handling for those
        # cases, so treat this as a plain end-of-stream -- except for
        # unterminated strings, which we surface as a PatsyError.
        if "unterminated string literal" in e.args[0]:
            raise PatsyError(
                "error tokenizing input ({})".format(e.args[0]),
                Origin(code, 0, len(code)),
            )
        # Guard in case future tokenize versions add other error cases:
        assert "EOF in multi-line" in e.args[0]
        return
def test_python_tokenize():
    """Check token streams and error behavior of python_tokenize."""
    # Fix: `import pytest` was duplicated (appeared twice mid-function);
    # import it once, up front.
    import pytest

    code = "a + (foo * -1)"
    tokens = list(python_tokenize(code))
    expected = [
        (tokenize.NAME, "a", Origin(code, 0, 1)),
        (tokenize.OP, "+", Origin(code, 2, 3)),
        (tokenize.OP, "(", Origin(code, 4, 5)),
        (tokenize.NAME, "foo", Origin(code, 5, 8)),
        (tokenize.OP, "*", Origin(code, 9, 10)),
        (tokenize.OP, "-", Origin(code, 11, 12)),
        (tokenize.NUMBER, "1", Origin(code, 12, 13)),
        (tokenize.OP, ")", Origin(code, 13, 14)),
    ]
    assert tokens == expected

    # An unclosed paren is tolerated (handled by our own parser downstream).
    code2 = "a + (b"
    tokens2 = list(python_tokenize(code2))
    expected2 = [
        (tokenize.NAME, "a", Origin(code2, 0, 1)),
        (tokenize.OP, "+", Origin(code2, 2, 3)),
        (tokenize.OP, "(", Origin(code2, 4, 5)),
        (tokenize.NAME, "b", Origin(code2, 5, 6)),
    ]
    assert tokens2 == expected2

    # Comments and unterminated strings are rejected.
    pytest.raises(PatsyError, list, python_tokenize("a b # c"))
    pytest.raises(PatsyError, list, python_tokenize('a b "c'))
_python_space_both = list("+-*/%&^|<>") + [
"==",
"<>",
"!=",
"<=",
">=",
"<<",
">>",
"**",
"//",
]
_python_space_before = _python_space_both + ["!", "~"]
_python_space_after = _python_space_both + [",", ":"]
def pretty_untokenize(typed_tokens):
    """Render ``(token_type, token_string)`` pairs as a whitespace-normalized
    string.

    Spacing is driven by a small state machine: literals (names, numbers,
    strings) are separated from each other, operators get spaces per the
    ``_python_space_*`` tables, and a handful of special cases (slices,
    star-args, keyword arguments, unary +/-) override the tables.
    """
    literal_types = (tokenize.NAME, tokenize.NUMBER, tokenize.STRING)
    chunks = []
    after_literal = False        # last emitted token was a literal
    pending_space = False        # last token requested a trailing space
    after_open_or_comma = False  # last token was "(" or ","
    after_object = False         # last token could end an expression
    open_brackets = []
    for tok_type, tok in typed_tokens:
        assert tok_type not in (tokenize.INDENT, tokenize.DEDENT, tokenize.NL)
        if tok_type in (tokenize.NEWLINE, tokenize.ENDMARKER):
            continue
        if tok_type in literal_types:
            if pending_space or after_literal:
                chunks.append(" ")
            chunks.append(tok)
            pending_space = False
            after_literal = True
        else:
            # Track bracket nesting so ":" and "=" can be contextualized.
            if tok in ("(", "[", "{"):
                open_brackets.append(tok)
            elif open_brackets and tok in (")", "]", "}"):
                open_brackets.pop()
            space_before = tok in _python_space_before
            space_after = tok in _python_space_after
            # Slice colon: foo[:10] -- no space after. (Elsewhere ":" keeps
            # its trailing space, as in "{1: ...}" or "if a: ...".)
            if tok == ":" and open_brackets and open_brackets[-1] == "[":
                space_after = False
            # Star-args: foo(*args), foo(a, **kw) -- glue to the name.
            if tok in ("*", "**") and after_open_or_comma:
                space_before = False
                space_after = False
            # Top-level assignment ("a = foo(b=1)") is spaced; keyword
            # arguments (inside brackets) are not.
            if tok == "=" and not open_brackets:
                space_before = True
                space_after = True
            # Unary +/- heuristic: a +/- following something object-like (a
            # NAME, NUMBER, STRING, or close paren) is probably binary;
            # otherwise probably unary, so glue it to its operand.
            if tok in ("+", "-") and not after_object:
                space_before = False
                space_after = False
            if pending_space or space_before:
                chunks.append(" ")
            chunks.append(tok)
            pending_space = space_after
            after_literal = False
        after_object = tok_type in literal_types or tok == ")"
        after_open_or_comma = tok in ("(", ",")
    return "".join(chunks)
def normalize_token_spacing(code):
    """Return *code* re-rendered with normalized whitespace."""
    # generate_tokens yields 5-tuples; pretty_untokenize only needs the
    # (type, string) prefix of each.
    pairs = (
        (tok_type, tok_string)
        for tok_type, tok_string, _, _, _ in tokenize.generate_tokens(
            StringIO(code).readline
        )
    )
    return pretty_untokenize(pairs)
def test_pretty_untokenize_and_normalize_token_spacing():
    """Table-driven check of the pretty-printer's spacing decisions."""
    cases = [
        ("1 + 1", "1 + 1"),
        ("1+1", "1 + 1"),
        ("1*(2+3**2)", "1 * (2 + 3 ** 2)"),
        ("a and b", "a and b"),
        ("foo(a=bar.baz[1:])", "foo(a=bar.baz[1:])"),
        ('{"hi":foo[:]}', '{"hi": foo[:]}'),
        ("'a' \"b\" 'c'", "'a' \"b\" 'c'"),
        ('"""a""" is 1 or 2==3', '"""a""" is 1 or 2 == 3'),
        ("foo ( * args )", "foo(*args)"),
        ("foo ( a * args )", "foo(a * args)"),
        ("foo ( ** args )", "foo(**args)"),
        ("foo ( a ** args )", "foo(a ** args)"),
        ("foo (1, * args )", "foo(1, *args)"),
        ("foo (1, a * args )", "foo(1, a * args)"),
        ("foo (1, ** args )", "foo(1, **args)"),
        ("foo (1, a ** args )", "foo(1, a ** args)"),
        ("a=foo(b = 1)", "a = foo(b=1)"),
        ("foo(+ 10, bar = - 1)", "foo(+10, bar=-1)"),
        ("1 + +10 + -1 - 5", "1 + +10 + -1 - 5"),
    ]
    for raw, expected in cases:
        assert normalize_token_spacing(raw) == expected