"""test orc compat""" import datetime from decimal import Decimal from io import BytesIO import os import numpy as np import pytest import pandas as pd from pandas import read_orc import pandas._testing as tm pytest.importorskip("pyarrow.orc") import pyarrow as pa pytestmark = pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) @pytest.fixture def dirpath(datapath): return datapath("io", "data", "orc") def test_orc_reader_empty(dirpath, using_infer_string): columns = [ "boolean1", "byte1", "short1", "int1", "long1", "float1", "double1", "bytes1", "string1", ] dtypes = [ "bool", "int8", "int16", "int32", "int64", "float32", "float64", "object", "str" if using_infer_string else "object", ] expected = pd.DataFrame(index=pd.RangeIndex(0)) for colname, dtype in zip(columns, dtypes, strict=True): expected[colname] = pd.Series(dtype=dtype) expected.columns = expected.columns.astype("str") inputfile = os.path.join(dirpath, "TestOrcFile.emptyFile.orc") got = read_orc(inputfile, columns=columns) tm.assert_equal(expected, got) def test_orc_reader_basic(dirpath): data = { "boolean1": np.array([False, True], dtype="bool"), "byte1": np.array([1, 100], dtype="int8"), "short1": np.array([1024, 2048], dtype="int16"), "int1": np.array([65536, 65536], dtype="int32"), "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"), "float1": np.array([1.0, 2.0], dtype="float32"), "double1": np.array([-15.0, -5.0], dtype="float64"), "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"), "string1": np.array(["hi", "bye"], dtype="object"), } expected = pd.DataFrame.from_dict(data) inputfile = os.path.join(dirpath, "TestOrcFile.test1.orc") got = read_orc(inputfile, columns=data.keys()) tm.assert_equal(expected, got) def test_orc_reader_decimal(dirpath): # Only testing the first 10 rows of data data = { "_col0": np.array( [ Decimal("-1000.50000"), Decimal("-999.60000"), Decimal("-998.70000"), Decimal("-997.80000"), Decimal("-996.90000"), Decimal("-995.10000"), Decimal("-994.11000"), Decimal("-993.12000"), Decimal("-992.13000"), Decimal("-991.14000"), ], dtype="object", ) } expected = pd.DataFrame.from_dict(data) inputfile = os.path.join(dirpath, "TestOrcFile.decimal.orc") got = read_orc(inputfile).iloc[:10] tm.assert_equal(expected, got) def test_orc_reader_date_low(dirpath): data = { "time": np.array( [ "1900-05-05 12:34:56.100000", "1900-05-05 12:34:56.100100", "1900-05-05 12:34:56.100200", "1900-05-05 12:34:56.100300", "1900-05-05 12:34:56.100400", "1900-05-05 12:34:56.100500", "1900-05-05 12:34:56.100600", "1900-05-05 12:34:56.100700", "1900-05-05 12:34:56.100800", "1900-05-05 12:34:56.100900", ], dtype="datetime64[ns]", ), "date": np.array( [ datetime.date(1900, 12, 25), datetime.date(1900, 12, 25), datetime.date(1900, 12, 25), datetime.date(1900, 12, 25), datetime.date(1900, 12, 25), datetime.date(1900, 12, 25), datetime.date(1900, 12, 25), datetime.date(1900, 12, 25), datetime.date(1900, 12, 25), datetime.date(1900, 12, 25), ], dtype="object", ), } expected = pd.DataFrame.from_dict(data) inputfile = os.path.join(dirpath, "TestOrcFile.testDate1900.orc") got = read_orc(inputfile).iloc[:10] tm.assert_equal(expected, got) def test_orc_reader_date_high(dirpath): data = { "time": np.array( [ "2038-05-05 12:34:56.100000", "2038-05-05 12:34:56.100100", "2038-05-05 12:34:56.100200", "2038-05-05 12:34:56.100300", "2038-05-05 12:34:56.100400", "2038-05-05 12:34:56.100500", "2038-05-05 12:34:56.100600", "2038-05-05 12:34:56.100700", "2038-05-05 12:34:56.100800", "2038-05-05 12:34:56.100900", ], dtype="datetime64[ns]", ), "date": np.array( [ datetime.date(2038, 12, 25), datetime.date(2038, 12, 25), datetime.date(2038, 12, 25), datetime.date(2038, 12, 25), datetime.date(2038, 12, 25), datetime.date(2038, 12, 25), datetime.date(2038, 12, 25), datetime.date(2038, 12, 25), datetime.date(2038, 12, 25), datetime.date(2038, 12, 25), ], dtype="object", ), } expected = pd.DataFrame.from_dict(data) inputfile = os.path.join(dirpath, "TestOrcFile.testDate2038.orc") got = read_orc(inputfile).iloc[:10] tm.assert_equal(expected, got) def test_orc_reader_snappy_compressed(dirpath): data = { "int1": np.array( [ -1160101563, 1181413113, 2065821249, -267157795, 172111193, 1752363137, 1406072123, 1911809390, -1308542224, -467100286, ], dtype="int32", ), "string1": np.array( [ "f50dcb8", "382fdaaa", "90758c6", "9e8caf3f", "ee97332b", "d634da1", "2bea4396", "d67d89e8", "ad71007e", "e8c82066", ], dtype="object", ), } expected = pd.DataFrame.from_dict(data) inputfile = os.path.join(dirpath, "TestOrcFile.testSnappy.orc") got = read_orc(inputfile).iloc[:10] tm.assert_equal(expected, got) def test_orc_roundtrip_file(dirpath, temp_file): # GH44554 # PyArrow gained ORC write support with the current argument order pytest.importorskip("pyarrow") data = { "boolean1": np.array([False, True], dtype="bool"), "byte1": np.array([1, 100], dtype="int8"), "short1": np.array([1024, 2048], dtype="int16"), "int1": np.array([65536, 65536], dtype="int32"), "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"), "float1": np.array([1.0, 2.0], dtype="float32"), "double1": np.array([-15.0, -5.0], dtype="float64"), "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"), "string1": np.array(["hi", "bye"], dtype="object"), } expected = pd.DataFrame.from_dict(data) expected.to_orc(temp_file) got = read_orc(temp_file) tm.assert_equal(expected, got) def test_orc_roundtrip_bytesio(): # GH44554 # PyArrow gained ORC write support with the current argument order pytest.importorskip("pyarrow") data = { "boolean1": np.array([False, True], dtype="bool"), "byte1": np.array([1, 100], dtype="int8"), "short1": np.array([1024, 2048], dtype="int16"), "int1": np.array([65536, 65536], dtype="int32"), "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"), "float1": np.array([1.0, 2.0], dtype="float32"), "double1": np.array([-15.0, -5.0], dtype="float64"), "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"), "string1": np.array(["hi", "bye"], dtype="object"), } expected = pd.DataFrame.from_dict(data) bytes = expected.to_orc() got = read_orc(BytesIO(bytes)) tm.assert_equal(expected, got) @pytest.mark.parametrize( "orc_writer_dtypes_not_supported", [ np.array([1, 20], dtype="uint64"), pd.Series(["a", "b", "a"], dtype="category"), [pd.Interval(left=0, right=2), pd.Interval(left=0, right=5)], [pd.Period("2022-01-03", freq="D"), pd.Period("2022-01-04", freq="D")], ], ) def test_orc_writer_dtypes_not_supported(orc_writer_dtypes_not_supported): # GH44554 # PyArrow gained ORC write support with the current argument order pytest.importorskip("pyarrow") df = pd.DataFrame({"unimpl": orc_writer_dtypes_not_supported}) msg = "The dtype of one or more columns is not supported yet." with pytest.raises(NotImplementedError, match=msg): df.to_orc() def test_orc_dtype_backend_pyarrow(using_infer_string): pytest.importorskip("pyarrow") df = pd.DataFrame( { "string": list("abc"), "string_with_nan": ["a", np.nan, "c"], "string_with_none": ["a", None, "c"], "bytes": [b"foo", b"bar", None], "int": list(range(1, 4)), "float": np.arange(4.0, 7.0, dtype="float64"), "float_with_nan": [2.0, np.nan, 3.0], "bool": [True, False, True], "bool_with_na": [True, False, None], "datetime": pd.date_range("20130101", periods=3, unit="ns"), "datetime_with_nat": [ pd.Timestamp("20130101"), pd.NaT, pd.Timestamp("20130103"), ], } ) # FIXME: without casting to ns we do not round-trip correctly df["datetime_with_nat"] = df["datetime_with_nat"].astype("M8[ns]") bytes_data = df.copy().to_orc() result = read_orc(BytesIO(bytes_data), dtype_backend="pyarrow") expected = pd.DataFrame( { col: pd.arrays.ArrowExtensionArray(pa.array(df[col], from_pandas=True)) for col in df.columns } ) if using_infer_string: # ORC does not preserve distinction between string and large string # -> the default large string comes back as string string_dtype = pd.ArrowDtype(pa.string()) expected["string"] = expected["string"].astype(string_dtype) expected["string_with_nan"] = expected["string_with_nan"].astype(string_dtype) expected["string_with_none"] = expected["string_with_none"].astype(string_dtype) tm.assert_frame_equal(result, expected) def test_orc_dtype_backend_numpy_nullable(): # GH#50503 pytest.importorskip("pyarrow") df = pd.DataFrame( { "string": list("abc"), "string_with_nan": ["a", np.nan, "c"], "string_with_none": ["a", None, "c"], "int": list(range(1, 4)), "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"), "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"), "float": np.arange(4.0, 7.0, dtype="float64"), "float_with_nan": [2.0, np.nan, 3.0], "bool": [True, False, True], "bool_with_na": [True, False, None], } ) bytes_data = df.copy().to_orc() result = read_orc(BytesIO(bytes_data), dtype_backend="numpy_nullable") expected = pd.DataFrame( { "string": pd.array(["a", "b", "c"], dtype=pd.StringDtype()), "string_with_nan": pd.array(["a", pd.NA, "c"], dtype=pd.StringDtype()), "string_with_none": pd.array(["a", pd.NA, "c"], dtype=pd.StringDtype()), "int": pd.Series([1, 2, 3], dtype="Int64"), "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"), "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"), "float": pd.Series([4.0, 5.0, 6.0], dtype="Float64"), "float_with_nan": pd.Series([2.0, pd.NA, 3.0], dtype="Float64"), "bool": pd.Series([True, False, True], dtype="boolean"), "bool_with_na": pd.Series([True, False, pd.NA], dtype="boolean"), } ) tm.assert_frame_equal(result, expected) def test_orc_uri_path(temp_file): expected = pd.DataFrame({"int": list(range(1, 4))}) expected.to_orc(temp_file) uri = temp_file.as_uri() result = read_orc(uri) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "index", [ pd.RangeIndex(start=2, stop=5, step=1), pd.RangeIndex(start=0, stop=3, step=1, name="non-default"), pd.Index([1, 2, 3]), ], ) def test_to_orc_non_default_index(index): df = pd.DataFrame({"a": [1, 2, 3]}, index=index) msg = ( "orc does not support serializing a non-default index|" "orc does not serialize index meta-data" ) with pytest.raises(ValueError, match=msg): df.to_orc() def test_invalid_dtype_backend(temp_file): msg = ( "dtype_backend numpy is invalid, only 'numpy_nullable' and " "'pyarrow' are allowed." ) df = pd.DataFrame({"int": list(range(1, 4))}) df.to_orc(temp_file) with pytest.raises(ValueError, match=msg): read_orc(temp_file, dtype_backend="numpy") def test_string_inference(temp_file): # GH#54431 df = pd.DataFrame(data={"a": ["x", "y"]}) df.to_orc(temp_file) with pd.option_context("future.infer_string", True): result = read_orc(temp_file) expected = pd.DataFrame( data={"a": ["x", "y"]}, dtype=pd.StringDtype(na_value=np.nan), columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected)