arrow/python/pyarrow/feather.py at master · JoyJava/arrow

History

132 lines (106 loc) · 3.66 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

# Licensed to the Apache Software Foundation (ASF) under one

# or more contributor license agreements. See the NOTICE file

# distributed with this work for additional information

# regarding copyright ownership. The ASF licenses this file

# to you under the Apache License, Version 2.0 (the

# "License"); you may not use this file except in compliance

# with the License. You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing,

# software distributed under the License is distributed on an

# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

# KIND, either express or implied. See the License for the

# specific language governing permissions and limitations

# under the License.

from distutils.version import LooseVersion

import os

import six

import pandas as pd

from pyarrow.compat import pdapi

from pyarrow.lib import FeatherError # noqa

from pyarrow.lib import RecordBatch, Table

import pyarrow.lib as ext

try:

infer_dtype = pdapi.infer_dtype

except AttributeError:

infer_dtype = pd.lib.infer_dtype

if LooseVersion(pd.__version__) < '0.17.0':

raise ImportError("feather requires pandas >= 0.17.0")

class FeatherReader(ext.FeatherReader):

def __init__(self, source):

self.source = source

self.open(source)

def read(self, columns=None, nthreads=1):

if columns is not None:

column_set = set(columns)

else:

column_set = None

columns = []

names = []

for i in range(self.num_columns):

name = self.get_column_name(i)

if column_set is None or name in column_set:

col = self.get_column(i)

columns.append(col)

names.append(name)

table = Table.from_arrays(columns, names=names)

return table.to_pandas(nthreads=nthreads)

class FeatherWriter(object):

def __init__(self, dest):

self.dest = dest

self.writer = ext.FeatherWriter()

self.writer.open(dest)

def write(self, df):

if isinstance(df, pd.SparseDataFrame):

df = df.to_dense()

if not df.columns.is_unique:

raise ValueError("cannot serialize duplicate column names")

# TODO(wesm): Remove this length check, see ARROW-1732

if len(df.columns) > 0:

batch = RecordBatch.from_pandas(df, preserve_index=False)

for i, name in enumerate(batch.schema.names):

col = batch[i]

self.writer.write_array(name, col)

self.writer.close()

def write_feather(df, dest):

"""

Write a pandas.DataFrame to Feather format

Parameters

----------

df : pandas.DataFrame

dest : string

Local file path

"""

writer = FeatherWriter(dest)

try:

writer.write(df)

except Exception:

# Try to make sure the resource is closed

import gc

writer = None

gc.collect()

if isinstance(dest, six.string_types):

try:

os.remove(dest)

except os.error:

pass

raise

def read_feather(source, columns=None, nthreads=1):

"""

Read a pandas.DataFrame from Feather format

Parameters

----------

source : string file path, or file-like object

columns : sequence, optional

Only read a specific set of columns. If not provided, all columns are

read

nthreads : int, default 1

Number of CPU threads to use when reading to pandas.DataFrame

Returns

-------

df : pandas.DataFrame

"""

reader = FeatherReader(source)

return reader.read(columns=columns, nthreads=nthreads)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

feather.py

Latest commit

History

feather.py

File metadata and controls