forked from apache/arrow
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathipc.py
More file actions
189 lines (153 loc) · 5.74 KB
/
ipc.py
File metadata and controls
189 lines (153 loc) · 5.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Arrow file and stream reader/writer classes, and other messaging tools
import pyarrow as pa
from pyarrow.lib import (Message, MessageReader, # noqa
read_message, read_record_batch, read_schema,
read_tensor, write_tensor,
get_record_batch_size, get_tensor_size)
import pyarrow.lib as lib
class _ReadPandasOption(object):
def read_pandas(self, **options):
"""
Read contents of stream and convert to pandas.DataFrame using
Table.to_pandas
Parameters
----------
**options : arguments to forward to Table.to_pandas
Returns
-------
df : pandas.DataFrame
"""
table = self.read_all()
return table.to_pandas(**options)
class RecordBatchStreamReader(lib._RecordBatchReader, _ReadPandasOption):
    """
    Reader for the Arrow streaming binary format.

    Parameters
    ----------
    source : str, pyarrow.NativeFile, or file-like Python object
        Either a file path, or a readable file object
    """

    def __init__(self, source):
        # All the work happens in the Cython base class; _open attaches
        # the input stream and reads the stream schema.
        self._open(source)
class RecordBatchStreamWriter(lib._RecordBatchWriter):
    """
    Writer for the Arrow streaming binary format.

    Parameters
    ----------
    sink : str, pyarrow.NativeFile, or file-like Python object
        Either a file path, or a writeable file object
    schema : pyarrow.Schema
        The Arrow schema for data to be written to the file
    """

    def __init__(self, sink, schema):
        # The Cython base class opens the sink and writes the stream
        # header for the given schema.
        self._open(sink, schema)
class RecordBatchFileReader(lib._RecordBatchFileReader, _ReadPandasOption):
    """
    Reader for Arrow record batch data in the Arrow binary file format.

    Parameters
    ----------
    source : str, pyarrow.NativeFile, or file-like Python object
        Either a file path, or a readable file object
    footer_offset : int, default None
        If the file is embedded in some larger file, this is the byte offset to
        the very end of the file data
    """

    def __init__(self, source, footer_offset=None):
        # footer_offset is forwarded so the Cython layer can locate the
        # Arrow footer inside a larger container file.
        self._open(source, footer_offset=footer_offset)
class RecordBatchFileWriter(lib._RecordBatchFileWriter):
    """
    Writer that produces the Arrow binary file format.

    Parameters
    ----------
    sink : str, pyarrow.NativeFile, or file-like Python object
        Either a file path, or a writeable file object
    schema : pyarrow.Schema
        The Arrow schema for data to be written to the file
    """

    def __init__(self, sink, schema):
        # The Cython base class opens the sink and writes the file
        # header/magic for the given schema.
        self._open(sink, schema)
def open_stream(source):
    """
    Create reader for the Arrow streaming format.

    Parameters
    ----------
    source : str, pyarrow.NativeFile, or file-like Python object
        Either a file path, or a readable file object

    Returns
    -------
    reader : RecordBatchStreamReader
    """
    # NOTE: the previous docstring documented a ``footer_offset`` parameter
    # copied from open_file(); the streaming format has no footer and this
    # function never accepted it.
    return RecordBatchStreamReader(source)
def open_file(source, footer_offset=None):
    """
    Create reader for the Arrow file format.

    Parameters
    ----------
    source : str, pyarrow.NativeFile, or file-like Python object
        Either a file path, or a readable file object
    footer_offset : int, default None
        If the file is embedded in some larger file, this is the byte offset to
        the very end of the file data

    Returns
    -------
    reader : RecordBatchFileReader
    """
    reader = RecordBatchFileReader(source, footer_offset=footer_offset)
    return reader
def serialize_pandas(df, nthreads=None, preserve_index=True):
    """Serialize a pandas DataFrame into a buffer protocol compatible object.

    Parameters
    ----------
    df : pandas.DataFrame
    nthreads : int, default None
        Number of threads to use for conversion to Arrow, default all CPUs
    preserve_index : boolean, default True
        If True, preserve the pandas index data, otherwise the result will have
        a default RangeIndex

    Returns
    -------
    buf : buffer
        An object compatible with the buffer protocol
    """
    batch = pa.RecordBatch.from_pandas(df, nthreads=nthreads,
                                       preserve_index=preserve_index)
    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchStreamWriter(sink, batch.schema)
    try:
        writer.write_batch(batch)
    finally:
        # Always close the writer so the stream is finalized and native
        # resources are released, even if write_batch raises.
        writer.close()
    return sink.get_result()
def deserialize_pandas(buf, nthreads=None):
    """Deserialize a buffer protocol compatible object into a pandas DataFrame.

    Parameters
    ----------
    buf : buffer
        An object compatible with the buffer protocol
    nthreads : int, default None
        The number of threads to use to convert the buffer to a DataFrame,
        default all CPUs

    Returns
    -------
    df : pandas.DataFrame
    """
    # Wrap the buffer so it can be read as an Arrow input stream.
    buffer_reader = pa.BufferReader(buf)
    reader = pa.RecordBatchStreamReader(buffer_reader)
    # Collect all record batches into a single Table, then convert.
    table = reader.read_all()
    return table.to_pandas(nthreads=nthreads)