ch03.md

Python 中的字符串对象

字符串对象是变长的不可变类型，定义代码如下：

/* String object: a variable-size object whose character data is stored
   inline, starting at ob_sval, at the end of the struct. */
typedef struct {
    PyObject_VAR_HEAD   /* presumably supplies ob_refcnt and ob_size, which the code in this file accesses -- confirm */
    long ob_shash;      /* cached hash of the string, or -1 if not computed yet */
    int ob_sstate;      /* non-zero iff the string is in the 'interned' dictionary */
    char ob_sval[1];    /* first byte of the character data; see the invariants below for its real extent */

    /* Invariants:
     *     ob_sval contains space for 'ob_size+1' elements.
     *     ob_sval[ob_size] == 0.
     *     ob_shash is the hash of the string or -1 if not computed yet.
     *     ob_sstate != 0 iff the string object is in stringobject.c's
     *       'interned' dictionary; in this case the two references
     *       from 'interned' to this object are *not counted* in ob_refcnt.
     */
} PyStringObject;

每个字段的含义看注释。

字符串哈希的计算方法：

/* Return the hash of string object `a', computing it on first use and
   caching it in a->ob_shash.  A stored value of -1 means "not computed
   yet", so this function never returns -1. */
static long
string_hash(PyStringObject *a)
{
	register Py_ssize_t len;
	register unsigned char *p;
	register long x;

	/* Fast path: the hash has already been computed and cached. */
	if (a->ob_shash != -1)
		return a->ob_shash;
	len = a->ob_size;
	p = (unsigned char *) a->ob_sval;
	/* Seed with the first byte.  For an empty string this reads
	   ob_sval[0], which the struct invariants say is the terminating 0. */
	x = *p << 7;
	/* Fold in every byte: multiply by 1000003, then XOR with the byte.
	   NOTE(review): the multiplication is done on a signed long, so this
	   presumably relies on wraparound on overflow -- confirm for the
	   compiler and flags in use. */
	while (--len>= 0)
		x = (1000003*x) ^ *p++;
	/* Mix in the length. */
	x ^= a->ob_size;
	/* -1 is reserved as the "not computed yet" marker; remap it. */
	if (x == -1)
		x = -2;
	a->ob_shash = x;
	return x;
}

从 C 中的字符串创建 PyStringObject 的代码如下：

/* This dictionary holds all interned strings.  Note that references to
   strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->ob_sstate?2:0)
   (an interned string is stored as both the key and the value of its
   dictionary entry, hence 2 -- see PyString_InternInPlace()).

   The dictionary is created lazily by PyString_InternInPlace(); until
   then this pointer is NULL.
*/
static PyObject *interned;

/*
   For both PyString_FromString() and PyString_FromStringAndSize(), the
   parameter `size' denotes number of characters to allocate, not counting any
   null terminating character.

   For PyString_FromString(), the parameter `str' points to a null-terminated
   string containing exactly `size' bytes.

   For PyString_FromStringAndSize(), the parameter `str' is
   either NULL or else points to a string containing at least `size' bytes.
   For PyString_FromStringAndSize(), the string in the `str' parameter does
   not have to be null-terminated.  (Therefore it is safe to construct a
   substring by calling `PyString_FromStringAndSize(origstring, substrlen)'.)
   If `str' is NULL then PyString_FromStringAndSize() will allocate `size+1'
   bytes (setting the last byte to the null terminating character) and you can
   fill in the data yourself.  If `str' is non-NULL then the resulting
   PyString object must be treated as immutable and you must not fill in nor
   alter the data yourself, since the strings may be shared.

   The PyObject member `op->ob_size', which denotes the number of "extra
   items" in a variable-size object, will contain the number of bytes
   allocated for string data, not counting the null terminating character.  It
   is therefore equal to the `size' parameter (for
   PyString_FromStringAndSize()) or the length of the string in the `str'
   parameter (for PyString_FromString()).
*/
PyObject *
PyString_FromStringAndSize(const char *str, Py_ssize_t size)
{
	register PyStringObject *op;
	assert(size>= 0);
	/* Empty string: hand out the shared cached object if one exists. */
	if (size == 0 && (op = nullstring) != NULL) {
#ifdef COUNT_ALLOCS
		null_strings++;
#endif
		Py_INCREF(op);
		return (PyObject *)op;
	}
	/* One-character string with known contents: hand out the cached
	   object for that character if one exists.  `*str & UCHAR_MAX' turns
	   the (possibly negative) char into an index in 0..UCHAR_MAX. */
	if (size == 1 && str != NULL &&
	    (op = characters[*str & UCHAR_MAX]) != NULL)
	{
#ifdef COUNT_ALLOCS
		one_strings++;
#endif
		Py_INCREF(op);
		return (PyObject *)op;
	}

	/* Inline PyObject_NewVar */
	/* The struct already contains ob_sval[1], which accounts for the
	   terminating null byte, so only `size' extra bytes are requested.
	   NOTE(review): no overflow check on this addition is visible here. */
	op = (PyStringObject *)PyObject_MALLOC(sizeof(PyStringObject) + size);
	if (op == NULL)
		return PyErr_NoMemory();
	PyObject_INIT_VAR(op, &PyString_Type, size);
	op->ob_shash = -1;	/* hash not computed yet */
	op->ob_sstate = SSTATE_NOT_INTERNED;
	/* With str == NULL the data bytes are left for the caller to fill in;
	   the terminator is written in either case. */
	if (str != NULL)
		Py_MEMCPY(op->ob_sval, str, size);
	op->ob_sval[size] = '\0';
	/* share short strings */
	if (size == 0) {
		/* Intern the new object (this may replace it with an already
		   interned equal string) and remember it in `nullstring'. */
		PyObject *t = (PyObject *)op;
		PyString_InternInPlace(&t);
		op = (PyStringObject *)t;
		nullstring = op;
		/* Extra reference: one for the cache slot, one for the caller. */
		Py_INCREF(op);
	} else if (size == 1 && str != NULL) {
		/* Same as above, but cached per character in `characters'. */
		PyObject *t = (PyObject *)op;
		PyString_InternInPlace(&t);
		op = (PyStringObject *)t;
		characters[*str & UCHAR_MAX] = op;
		/* Extra reference: one for the cache slot, one for the caller. */
		Py_INCREF(op);
	}
	return (PyObject *) op;
}

/* Create a string object from the null-terminated C string `str', which
   must not be NULL (asserted).  Returns a new reference, or NULL with an
   exception set if the string is too long or memory cannot be allocated.
   Empty and one-character strings are served from, and stored into, the
   `nullstring' / `characters' caches. */
PyObject *
PyString_FromString(const char *str)
{
	register size_t size;
	register PyStringObject *op;

	assert(str != NULL);
	size = strlen(str);
	/* The length is stored via a Py_ssize_t `size' argument below, so
	   reject anything larger than PY_SSIZE_T_MAX. */
	if (size> PY_SSIZE_T_MAX) {
		PyErr_SetString(PyExc_OverflowError,
			"string is too long for a Python string");
		return NULL;
	}
	/* Empty string: hand out the shared cached object if one exists. */
	if (size == 0 && (op = nullstring) != NULL) {
#ifdef COUNT_ALLOCS
		null_strings++;
#endif
		Py_INCREF(op);
		return (PyObject *)op;
	}
	/* One-character string: hand out the cached object for that character
	   if one exists.  `*str & UCHAR_MAX' turns the (possibly negative)
	   char into an index in 0..UCHAR_MAX. */
	if (size == 1 && (op = characters[*str & UCHAR_MAX]) != NULL) {
#ifdef COUNT_ALLOCS
		one_strings++;
#endif
		Py_INCREF(op);
		return (PyObject *)op;
	}

	/* Inline PyObject_NewVar */
	/* The struct already contains ob_sval[1], which accounts for the
	   terminating null byte, so only `size' extra bytes are requested. */
	op = (PyStringObject *)PyObject_MALLOC(sizeof(PyStringObject) + size);
	if (op == NULL)
		return PyErr_NoMemory();
	PyObject_INIT_VAR(op, &PyString_Type, size);
	op->ob_shash = -1;	/* hash not computed yet */
	op->ob_sstate = SSTATE_NOT_INTERNED;
	/* size+1 so that the source's terminating null byte is copied too. */
	Py_MEMCPY(op->ob_sval, str, size+1);
	/* share short strings */
	if (size == 0) {
		/* Intern the new object (this may replace it with an already
		   interned equal string) and remember it in `nullstring'. */
		PyObject *t = (PyObject *)op;
		PyString_InternInPlace(&t);
		op = (PyStringObject *)t;
		nullstring = op;
		/* Extra reference: one for the cache slot, one for the caller. */
		Py_INCREF(op);
	} else if (size == 1) {
		/* Same as above, but cached per character in `characters'. */
		PyObject *t = (PyObject *)op;
		PyString_InternInPlace(&t);
		op = (PyStringObject *)t;
		characters[*str & UCHAR_MAX] = op;
		/* Extra reference: one for the cache slot, one for the caller. */
		Py_INCREF(op);
	}
	return (PyObject *) op;
}

一些要点都在注释里面了。

另外要注意的一点是字符串的 intern 机制。上面的创建函数会对短字符串（空字符串和长度为 1 的字符串）自动执行 intern，使它们被共享。上面代码开头的一段定义了一个变量 interned，interned 其实是一个字典（PyDict），保存了所有被 intern 的字符串，键和值都是该字符串对象本身。

在创建字符串时，如果该字符串是短字符串（假设为 "A"），如果 interned 中含有 "A"，则返回 "A"，并销毁之前创建的 PyStringObject（减少引用计数），如果 interned 中没有 "A"，则将 "A" 保存起来。

总的来说，intern 机制减少了字符串的内存占用。即使你创建了 100 个短字符串 "A"，在 Python 内部其实也只有一份 "A"，这 100 个 "A" 引用的都是同一个对象。

下面的代码描述了如何共享短字符串：

/* Intern the string object that *p points to.

   If an equal string is already in the `interned' dictionary, *p is
   replaced by that object (a reference to it is taken and the reference
   to the old *p is dropped).  Otherwise the object itself is added to the
   dictionary and marked as interned.

   Calls Py_FatalError() if *p is NULL or not a string.  Returns without
   doing anything for string subclasses and for strings that are already
   interned.  If the dictionary cannot be created or updated, the error is
   cleared and *p is left as it was. */
void
PyString_InternInPlace(PyObject **p)
{
	register PyStringObject *s = (PyStringObject *)(*p);
	PyObject *t;
	if (s == NULL || !PyString_Check(s))
		Py_FatalError("PyString_InternInPlace: strings only please!");
	/* If it's a string subclass, we don't really know what putting
	   it in the interned dict might do. */
	if (!PyString_CheckExact(s))
		return;
	if (PyString_CHECK_INTERNED(s))
		return;
	/* Create the dictionary on first use. */
	if (interned == NULL) {
		interned = PyDict_New();
		if (interned == NULL) {
			PyErr_Clear(); /* Don't leave an exception */
			return;
		}
	}
	/* An equal string is already interned: switch *p over to it. */
	t = PyDict_GetItem(interned, (PyObject *)s);
	if (t) {
		Py_INCREF(t);
		Py_DECREF(*p);
		*p = t;
		return;
	}

	/* Not interned yet: store the string as both key and value. */
	if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
		PyErr_Clear();
		return;
	}
	/* The two references in interned are not counted by refcnt.
	   The string deallocator will take care of this */
	s->ob_refcnt -= 2;
	/* NOTE(review): PyString_CHECK_INTERNED is used as an assignment
	   target here; presumably it expands to s->ob_sstate -- confirm. */
	PyString_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
}

短字符串缓冲池

Python 使用 characters 数组保存了长度为 1 的字符串，当创建新的字符串时，如果 characters 中已经存在该字符串，则直接返回 characters 中的字符串。

/* Cache of shared one-character string objects, indexed by the character's
   value masked with UCHAR_MAX.  A NULL slot means that no string has been
   cached for that character yet. */
static PyStringObject *characters[UCHAR_MAX + 1];
// UCHAR_MAX = 255 (with 8-bit chars), so the array has 256 slots

修改代码测试

好像 characters 不会缓冲'*'之类的特殊字符。

更新： 关于这个问题我在 CPyUG发过帖子。经过更多的测试，把我的发现记录一下。

通过Python shell测试和使用Python执行脚本文件进行测试的结果不一致，下面是两者的测试结果

使用 len('?') 打印出 characters 中不为 NULL 的元素。

Python shell

 >>> len('?')
 value: /(47), refcnt: 12
 value: 1(49), refcnt: 2
 value: E(69), refcnt: 1
 value: F(70), refcnt: 1
 value: N(78), refcnt: 1
 value: O(79), refcnt: 1
 value: R(82), refcnt: 1
 value: T(84), refcnt: 1
 value: W(87), refcnt: 1
 value: X(88), refcnt: 1
 value: _(95), refcnt: 7
 value: a(97), refcnt: 3
 value: c(99), refcnt: 4
 value: d(100), refcnt: 3
 value: e(101), refcnt: 2
 value: f(102), refcnt: 2
 value: g(103), refcnt: 1
 value: i(105), refcnt: 10
 value: k(107), refcnt: 8
 value: l(108), refcnt: 1
 value: m(109), refcnt: 6
 value: n(110), refcnt: 4
 value: o(111), refcnt: 1
 value: p(112), refcnt: 7
 value: r(114), refcnt: 2
 value: s(115), refcnt: 6
 value: t(116), refcnt: 4
 value: u(117), refcnt: 1
 value: w(119), refcnt: 3

python t.py

 # t.py
 len('?')

 # 输出结果
 value: /(47), refcnt: 12
 value: 1(49), refcnt: 2
 value: ?(63), refcnt: 4
 value: E(69), refcnt: 1
 value: F(70), refcnt: 1
 value: N(78), refcnt: 1
 value: O(79), refcnt: 1
 value: R(82), refcnt: 1
 value: T(84), refcnt: 1
 value: W(87), refcnt: 1
 value: X(88), refcnt: 1
 value: _(95), refcnt: 7
 value: a(97), refcnt: 3
 value: c(99), refcnt: 4
 value: d(100), refcnt: 3
 value: e(101), refcnt: 2
 value: f(102), refcnt: 2
 value: g(103), refcnt: 1
 value: i(105), refcnt: 10
 value: k(107), refcnt: 8
 value: l(108), refcnt: 1
 value: m(109), refcnt: 6
 value: n(110), refcnt: 4
 value: o(111), refcnt: 1
 value: p(112), refcnt: 7
 value: r(114), refcnt: 2
 value: s(115), refcnt: 6
 value: t(116), refcnt: 4
 value: u(117), refcnt: 1
 value: w(119), refcnt: 3

通过比较发现两者的结果大体相同，但是第二个结果多了 '?'。猜测多出的这个 '?' 是因为 t.py 中包含 '?'。可是为什么在 Python shell 中，同样的 len('?') 却没有输出 '?' 呢？

当 Python shell 启动之后，似乎 characters 被“冻结”了，无法再获取它的值。

>>> len('?')
value: /(47), refcnt: 12
value: 1(49), refcnt: 2
value: E(69), refcnt: 1
value: F(70), refcnt: 1
value: N(78), refcnt: 1
value: O(79), refcnt: 1
value: R(82), refcnt: 1
value: T(84), refcnt: 1
value: W(87), refcnt: 1
value: X(88), refcnt: 1
value: _(95), refcnt: 7
value: a(97), refcnt: 3
value: c(99), refcnt: 4
value: d(100), refcnt: 3
value: e(101), refcnt: 2
value: f(102), refcnt: 2
value: g(103), refcnt: 1
value: i(105), refcnt: 10
value: k(107), refcnt: 8
value: l(108), refcnt: 1
value: m(109), refcnt: 6
value: n(110), refcnt: 4
value: o(111), refcnt: 1
value: p(112), refcnt: 7
value: r(114), refcnt: 2
value: s(115), refcnt: 6
value: t(116), refcnt: 4
value: u(117), refcnt: 1
value: w(119), refcnt: 3
1
>>> a = 'x'
>>> len('?')
value: /(47), refcnt: 12
value: 1(49), refcnt: 2
value: E(69), refcnt: 1
value: F(70), refcnt: 1
value: N(78), refcnt: 1
value: O(79), refcnt: 1
value: R(82), refcnt: 1
value: T(84), refcnt: 1
value: W(87), refcnt: 1
value: X(88), refcnt: 1
value: _(95), refcnt: 8
value: a(97), refcnt: 4
value: c(99), refcnt: 4
value: d(100), refcnt: 3
value: e(101), refcnt: 2
value: f(102), refcnt: 2
value: g(103), refcnt: 1
value: i(105), refcnt: 10
value: k(107), refcnt: 8
value: l(108), refcnt: 1
value: m(109), refcnt: 6
value: n(110), refcnt: 4
value: o(111), refcnt: 1
value: p(112), refcnt: 7
value: r(114), refcnt: 2
value: s(115), refcnt: 6
value: t(116), refcnt: 4
value: u(117), refcnt: 1
value: w(119), refcnt: 3
1

第一次执行len('?')，输出结果中没有'x'，当定义a = 'x'之后，再次执行len('?')，输出结果中仍然没有'x'。从 stringobject.c 的代码来看，创建字符串时，假如长度为1，则会将其缓存到 characters 中。

会不会是只有在 Python 启动时才会缓存 characters？

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Python 中的字符串对象

短字符串缓冲池

修改代码测试

FilesExpand file tree

ch03.md

Latest commit

History

ch03.md

File metadata and controls

Python 中的字符串对象

短字符串缓冲池

修改代码测试