import datetime
from datetime import timedelta
import re

import numpy as np
import pytest

from pandas._libs.tslibs import Timestamp
from pandas.compat import PY312

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    Series,
    _testing as tm,
    concat,
    date_range,
    read_hdf,
)

pytestmark = [pytest.mark.single_cpu]

tables = pytest.importorskip("tables")


@pytest.mark.filterwarnings("ignore::tables.NaturalNameWarning")
def test_append(temp_hdfstore):
    """Frames appended in pieces read back equal to the original frame."""
    # Keys that are not natural names are allowed, but you almost always
    # don't want to use them; the resulting tables.NaturalNameWarning is
    # silenced by the filterwarnings mark above.
    df = DataFrame(
        np.random.default_rng(2).standard_normal((20, 4)),
        columns=Index(list("ABCD")),
        index=date_range("2000-01-01", periods=20, freq="B"),
    )
    temp_hdfstore.append("df1", df[:10])
    temp_hdfstore.append("df1", df[10:])
    tm.assert_frame_equal(temp_hdfstore["df1"], df)

    temp_hdfstore.put("df2", df[:10], format="table")
    temp_hdfstore.append("df2", df[10:])
    tm.assert_frame_equal(temp_hdfstore["df2"], df)

    temp_hdfstore.append("/df3", df[:10])
    temp_hdfstore.append("/df3", df[10:])
    tm.assert_frame_equal(temp_hdfstore["df3"], df)

    # this is allowed but you almost always don't want to do it
    # tables.NaturalNameWarning
    temp_hdfstore.append("/df3 foo", df[:10])
    temp_hdfstore.append("/df3 foo", df[10:])
    tm.assert_frame_equal(temp_hdfstore["df3 foo"], df)

    # dtype issues - mixed type in a single object column
    df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]])
    df["mixed_column"] = "testing"
    df.loc[2, "mixed_column"] = np.nan
    temp_hdfstore.append("df", df)
    tm.assert_frame_equal(temp_hdfstore["df"], df)

    # uints - test storage of uints
    uint_data = DataFrame(
        {
            "u08": Series(
                np.random.default_rng(2).integers(0, high=255, size=5),
                dtype=np.uint8,
            ),
            "u16": Series(
                np.random.default_rng(2).integers(0, high=65535, size=5),
                dtype=np.uint16,
            ),
            "u32": Series(
                np.random.default_rng(2).integers(0, high=2**30, size=5),
                dtype=np.uint32,
            ),
            "u64": Series(
                [2**58, 2**59, 2**60, 2**61, 2**62],
                dtype=np.uint64,
            ),
        },
        index=np.arange(5),
    )
    temp_hdfstore.append("uints", uint_data)
    tm.assert_frame_equal(temp_hdfstore["uints"], uint_data, check_index_type=True)

    # uints - test storage of uints in indexable columns
    temp_hdfstore.remove("uints")
    # 64-bit indices not yet supported
    temp_hdfstore.append("uints", uint_data, data_columns=["u08", "u16", "u32"])
    tm.assert_frame_equal(temp_hdfstore["uints"], uint_data, check_index_type=True)


def test_append_series(temp_hdfstore):
    """Series (string, datetime, range and multi-index) survive append/select."""
    # basic
    ss = Series(range(20), dtype=np.float64, index=[f"i_{i}" for i in range(20)])
    ts = Series(
        np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
    )
    ns = Series(np.arange(100))

    temp_hdfstore.append("ss", ss)
    result = temp_hdfstore["ss"]
    tm.assert_series_equal(result, ss)
    assert result.name is None

    temp_hdfstore.append("ts", ts)
    result = temp_hdfstore["ts"]
    tm.assert_series_equal(result, ts)
    assert result.name is None

    ns.name = "foo"
    temp_hdfstore.append("ns", ns)
    result = temp_hdfstore["ns"]
    tm.assert_series_equal(result, ns)
    assert result.name == ns.name

    # select on the values
    expected = ns[ns > 60]
    result = temp_hdfstore.select("ns", "foo>60")
    tm.assert_series_equal(result, expected)

    # select on the index and values
    expected = ns[(ns > 70) & (ns.index < 90)]
    # Reading/writing RangeIndex info is not supported yet
    expected.index = Index(expected.index._data)
    result = temp_hdfstore.select("ns", "foo>70 and index<90")
    tm.assert_series_equal(result, expected, check_index_type=True)

    # multi-index
    mi = DataFrame(np.random.default_rng(2).standard_normal((5, 1)), columns=["A"])
    mi["B"] = np.arange(len(mi))
    mi["C"] = "foo"
    mi.loc[3:5, "C"] = "bar"
    mi.set_index(["C", "B"], inplace=True)
    s = mi.stack()
    s.index = s.index.droplevel(2)
    temp_hdfstore.append("mi", s)
    tm.assert_series_equal(temp_hdfstore["mi"], s, check_index_type=True)


def test_append_some_nans(temp_hdfstore):
    """Frames with NaNs in some (or whole) columns round-trip via append."""
    df = DataFrame(
        {
            "A": Series(np.random.default_rng(2).standard_normal(20)).astype("int32"),
            "A1": np.random.default_rng(2).standard_normal(20),
            "A2": np.random.default_rng(2).standard_normal(20),
            "B": "foo",
            "C": "bar",
            "D": Timestamp("2001-01-01").as_unit("ns"),
            "E": Timestamp("2001-01-02").as_unit("ns"),
        },
        index=np.arange(20),
    )
    # some nans
    df.loc[0:15, ["A1", "B", "D", "E"]] = np.nan
    temp_hdfstore.append("df1", df[:10])
    temp_hdfstore.append("df1", df[10:])
    tm.assert_frame_equal(temp_hdfstore["df1"], df, check_index_type=True)

    # first column
    df1 = df.copy()
    df1["A1"] = np.nan
    temp_hdfstore.remove("df1")
    temp_hdfstore.append("df1", df1[:10])
    temp_hdfstore.append("df1", df1[10:])
    tm.assert_frame_equal(temp_hdfstore["df1"], df1, check_index_type=True)

    # 2nd column
    df2 = df.copy()
    df2["A2"] = np.nan
    temp_hdfstore.append("df2", df2[:10])
    temp_hdfstore.append("df2", df2[10:])
    tm.assert_frame_equal(temp_hdfstore["df2"], df2, check_index_type=True)

    # datetimes
    df3 = df.copy()
    df3["E"] = np.nan
    temp_hdfstore.append("df3", df3[:10])
    temp_hdfstore.append("df3", df3[10:])
    tm.assert_frame_equal(temp_hdfstore["df3"], df3, check_index_type=True)


def test_append_all_nans(temp_hdfstore, using_infer_string):
    """All-NaN rows are dropped or kept according to ``dropna`` / the option."""
    df = DataFrame(
        {
            "A1": np.random.default_rng(2).standard_normal(20),
            "A2": np.random.default_rng(2).standard_normal(20),
        },
        index=np.arange(20),
    )
    df.loc[0:15, :] = np.nan

    # nan some entire rows (dropna=True)
    temp_hdfstore.append("df", df[:10], dropna=True)
    temp_hdfstore.append("df", df[10:], dropna=True)
    tm.assert_frame_equal(temp_hdfstore["df"], df[-4:], check_index_type=True)

    # nan some entire rows (dropna=False)
    temp_hdfstore.append("df2", df[:10], dropna=False)
    temp_hdfstore.append("df2", df[10:], dropna=False)
    tm.assert_frame_equal(temp_hdfstore["df2"], df, check_index_type=True)

    # tests the option io.hdf.dropna_table
    with pd.option_context("io.hdf.dropna_table", False):
        temp_hdfstore.append("df3", df[:10])
        temp_hdfstore.append("df3", df[10:])
        tm.assert_frame_equal(temp_hdfstore["df3"], df)

    with pd.option_context("io.hdf.dropna_table", True):
        temp_hdfstore.append("df4", df[:10])
        temp_hdfstore.append("df4", df[10:])
        tm.assert_frame_equal(temp_hdfstore["df4"], df[-4:])

    # nan some entire rows (string are still written!)
    df = DataFrame(
        {
            "A1": np.random.default_rng(2).standard_normal(20),
            "A2": np.random.default_rng(2).standard_normal(20),
            "B": "foo",
            "C": "bar",
        },
        index=np.arange(20),
    )
    df.loc[0:15, :] = np.nan

    temp_hdfstore.remove("df")
    temp_hdfstore.append("df", df[:10], dropna=True)
    temp_hdfstore.append("df", df[10:], dropna=True)
    result = temp_hdfstore["df"]
    expected = df
    if using_infer_string:
        # TODO: Test is incorrect when not using_infer_string.
        #  Should take the last 4 rows unconditionally.
        expected = expected[-4:]
    tm.assert_frame_equal(result, expected, check_index_type=True)

    temp_hdfstore.remove("df2")
    temp_hdfstore.append("df2", df[:10], dropna=False)
    temp_hdfstore.append("df2", df[10:], dropna=False)
    tm.assert_frame_equal(temp_hdfstore["df2"], df, check_index_type=True)

    # nan some entire rows (but since we have dates they are still
    # written!)
    df = DataFrame(
        {
            "A1": np.random.default_rng(2).standard_normal(20),
            "A2": np.random.default_rng(2).standard_normal(20),
            "B": "foo",
            "C": "bar",
            "D": Timestamp("2001-01-01").as_unit("ns"),
            "E": Timestamp("2001-01-02").as_unit("ns"),
        },
        index=np.arange(20),
    )
    df.loc[0:15, :] = np.nan

    temp_hdfstore.remove("df")
    temp_hdfstore.append("df", df[:10], dropna=True)
    temp_hdfstore.append("df", df[10:], dropna=True)
    tm.assert_frame_equal(temp_hdfstore["df"], df, check_index_type=True)

    temp_hdfstore.remove("df2")
    temp_hdfstore.append("df2", df[:10], dropna=False)
    temp_hdfstore.append("df2", df[10:], dropna=False)
    tm.assert_frame_equal(temp_hdfstore["df2"], df, check_index_type=True)


def test_append_frame_column_oriented(temp_hdfstore, request):
    """Appending along the columns axis and selecting by column/index."""
    # column oriented
    df = DataFrame(
        np.random.default_rng(2).standard_normal((10, 4)),
        columns=Index(list("ABCD")),
        index=date_range("2000-01-01", periods=10, freq="B"),
    )
    df.index = df.index._with_freq(None)  # freq doesn't round-trip

    temp_hdfstore.append("df1", df.iloc[:, :2], axes=["columns"])
    temp_hdfstore.append("df1", df.iloc[:, 2:])
    tm.assert_frame_equal(temp_hdfstore["df1"], df)

    result = temp_hdfstore.select("df1", "columns=A")
    expected = df.reindex(columns=["A"])
    tm.assert_frame_equal(expected, result)

    # selection on the non-indexable
    request.applymarker(
        pytest.mark.xfail(
            PY312,
            reason="AST change in PY312",
            raises=ValueError,
        )
    )
    result = temp_hdfstore.select("df1", ("columns=A", "index=df.index[0:4]"))
    expected = df.reindex(columns=["A"], index=df.index[0:4])
    tm.assert_frame_equal(expected, result)

    # this isn't supported
    msg = re.escape(
        "passing a filterable condition to a non-table indexer "
        "[Filter: Not Initialized]"
    )
    with pytest.raises(TypeError, match=msg):
        temp_hdfstore.select("df1", "columns=A and index>df.index[4]")


def test_append_with_different_block_ordering(temp_hdfstore):
    """Appends succeed across block orderings but reject extra fields."""
    # GH 4096; using same frames, but different block orderings
    for i in range(10):
        df = DataFrame(
            np.random.default_rng(2).standard_normal((10, 2)), columns=list("AB")
        )
        df["index"] = range(10)
        df["index"] += i * 10
        df["int64"] = Series([1] * len(df), dtype="int64")
        df["int16"] = Series([1] * len(df), dtype="int16")

        # Re-inserting a column moves it to the end, which changes the
        # column order of the frame being appended.
        if i % 2 == 0:
            del df["int64"]
            df["int64"] = Series([1] * len(df), dtype="int64")
        if i % 3 == 0:
            a = df.pop("A")
            df["A"] = a

        df.set_index("index", inplace=True)

        temp_hdfstore.append("df", df)

    # test a different ordering but with more fields (like invalid
    # combinations)
    df = DataFrame(
        np.random.default_rng(2).standard_normal((10, 2)),
        columns=list("AB"),
        dtype="float64",
    )
    df["int64"] = Series([1] * len(df), dtype="int64")
    df["int16"] = Series([1] * len(df), dtype="int16")
    temp_hdfstore.remove("df")
    temp_hdfstore.append("df", df)

    # store additional fields in different blocks
    df["int16_2"] = Series([1] * len(df), dtype="int16")
    msg = re.escape(
        "cannot match existing table structure for [int16] on appending data"
    )
    with pytest.raises(ValueError, match=msg):
        temp_hdfstore.append("df", df)

    # store multiple additional fields in different blocks
    df["float_3"] = Series([1.0] * len(df), dtype="float64")
    msg = re.escape("cannot match existing table structure for [A,B] on appending data")
    with pytest.raises(ValueError, match=msg):
        temp_hdfstore.append("df", df)


def test_append_with_strings(temp_hdfstore):
    """String column item sizes, ``min_itemsize`` and NaN strings on append."""

    def check_col(key, name, size):
        # Assert the on-disk itemsize of column ``name`` in table ``key``.
        assert (
            getattr(temp_hdfstore.get_storer(key).table.description, name).itemsize
            == size
        )

    # avoid truncation on elements
    df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]])
    temp_hdfstore.append("df_big", df)
    tm.assert_frame_equal(temp_hdfstore.select("df_big"), df)
    check_col("df_big", "values_block_1", 15)

    # appending smaller string ok
    df2 = DataFrame([[124, "asdqy"], [346, "dggnhefbdfb"]])
    temp_hdfstore.append("df_big", df2)
    expected = concat([df, df2])
    tm.assert_frame_equal(temp_hdfstore.select("df_big"), expected)
    check_col("df_big", "values_block_1", 15)

    # avoid truncation on elements
    df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]])
    temp_hdfstore.append("df_big2", df, min_itemsize={"values": 50})
    tm.assert_frame_equal(temp_hdfstore.select("df_big2"), df)
    check_col("df_big2", "values_block_1", 50)

    # bigger string on next append
    temp_hdfstore.append("df_new", df)
    df_new = DataFrame([[124, "abcdefqhij"], [346, "abcdefghijklmnopqrtsuvwxyz"]])
    msg = (
        r"Trying to store a string with len \[26\] in "
        r"\[values_block_1\] column but\n"
        r"this column has a limit of \[15\]!\n"
        "Consider using min_itemsize to preset the sizes on these "
        "columns"
    )
    with pytest.raises(ValueError, match=msg):
        temp_hdfstore.append("df_new", df_new)

    # min_itemsize on Series index (GH 11412)
    df = DataFrame(
        {
            "A": [0.0, 1.0, 2.0, 3.0, 4.0],
            "B": [0.0, 1.0, 0.0, 1.0, 0.0],
            "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"]),
            "D": date_range("20130101", periods=5),
        }
    ).set_index("C")
    temp_hdfstore.append("ss", df["B"], min_itemsize={"index": 4})
    tm.assert_series_equal(temp_hdfstore.select("ss"), df["B"])

    # same as above, with data_columns=True
    temp_hdfstore.append("ss2", df["B"], data_columns=True, min_itemsize={"index": 4})
    tm.assert_series_equal(temp_hdfstore.select("ss2"), df["B"])

    # min_itemsize in index without appending (GH 10381)
    temp_hdfstore.put("ss3", df, format="table", min_itemsize={"index": 6})
    # just make sure there is a longer string:
    df2 = df.copy().reset_index().assign(C="longer").set_index("C")
    temp_hdfstore.append("ss3", df2)
    tm.assert_frame_equal(temp_hdfstore.select("ss3"), concat([df, df2]))

    # same as above, with a Series
    temp_hdfstore.put("ss4", df["B"], format="table", min_itemsize={"index": 6})
    temp_hdfstore.append("ss4", df2["B"])
    tm.assert_series_equal(temp_hdfstore.select("ss4"), concat([df["B"], df2["B"]]))

    # with nans
    df = DataFrame(
        np.random.default_rng(2).standard_normal((10, 4)),
        columns=Index(list("ABCD")),
        index=date_range("2000-01-01", periods=10, freq="B"),
    )
    df["string"] = "foo"
    df.loc[df.index[1:4], "string"] = np.nan
    df["string2"] = "bar"
    df.loc[df.index[4:8], "string2"] = np.nan
    df["string3"] = "bah"
    df.loc[df.index[1:], "string3"] = np.nan
    temp_hdfstore.append("df", df)
    result = temp_hdfstore.select("df")
    tm.assert_frame_equal(result, df)


def test_append_with_strings2(temp_hdfstore):
    """``min_itemsize`` interaction with data columns, and invalid keys."""

    def check_col(key, name, size):
        # Assert the on-disk itemsize of column ``name`` in table ``key``.
        assert (
            getattr(temp_hdfstore.get_storer(key).table.description, name).itemsize
            == size
        )

    df = DataFrame({"A": "foo", "B": "bar"}, index=range(10))

    # a min_itemsize that creates a data_column
    temp_hdfstore.append("df", df, min_itemsize={"A": 200})
    check_col("df", "A", 200)
    assert temp_hdfstore.get_storer("df").data_columns == ["A"]

    # a min_itemsize that creates a data_column2
    temp_hdfstore.remove("df")
    temp_hdfstore.append("df", df, data_columns=["B"], min_itemsize={"A": 200})
    check_col("df", "A", 200)
    assert temp_hdfstore.get_storer("df").data_columns == ["B", "A"]

    # a min_itemsize that creates a data_column2
    temp_hdfstore.remove("df")
    temp_hdfstore.append("df", df, data_columns=["B"], min_itemsize={"values": 200})
    check_col("df", "B", 200)
    check_col("df", "values_block_0", 200)
    assert temp_hdfstore.get_storer("df").data_columns == ["B"]

    # infer the .typ on subsequent appends
    temp_hdfstore.remove("df")
    temp_hdfstore.append("df", df[:5], min_itemsize=200)
    temp_hdfstore.append("df", df[5:], min_itemsize=200)
    tm.assert_frame_equal(temp_hdfstore["df"], df)

    # invalid min_itemsize keys
    df = DataFrame(["foo", "foo", "foo", "barh", "barh", "barh"], columns=["A"])
    temp_hdfstore.remove("df")
    msg = re.escape(
        "min_itemsize has the key [foo] which is not an axis or data_column"
    )
    with pytest.raises(ValueError, match=msg):
        temp_hdfstore.append("df", df, min_itemsize={"foo": 20, "foobar": 20})


def test_append_with_empty_string(temp_hdfstore):
    """An empty string can be appended to an existing string column."""
    # with all empty strings (GH 12242)
    df = DataFrame({"x": ["a", "b", "c", "d", "e", "f", ""]})
    temp_hdfstore.append("df", df[:-1], min_itemsize={"x": 1})
    temp_hdfstore.append("df", df[-1:], min_itemsize={"x": 1})
    tm.assert_frame_equal(temp_hdfstore.select("df"), df)


def test_append_with_data_columns(temp_hdfstore):
    """Data columns are indexed, sized via ``min_itemsize`` and queryable."""
    df = DataFrame(
        np.random.default_rng(2).standard_normal((10, 4)),
        columns=Index(list("ABCD")),
        index=date_range("2000-01-01", periods=10, freq="B", unit="ns"),
    )
    df.iloc[0, df.columns.get_loc("B")] = 1.0
    temp_hdfstore.append("df", df[:2], data_columns=["B"])
    temp_hdfstore.append("df", df[2:])
    tm.assert_frame_equal(temp_hdfstore["df"], df)

    # check that we have indices created
    assert temp_hdfstore._handle.root.df.table.cols.index.is_indexed is True
    assert temp_hdfstore._handle.root.df.table.cols.B.is_indexed is True

    # data column searching
    result = temp_hdfstore.select("df", "B>0")
    expected = df[df.B > 0]
    tm.assert_frame_equal(result, expected)

    # data column searching (with an indexable and a data_columns)
    result = temp_hdfstore.select("df", "B>0 and index>df.index[3]")
    df_new = df.reindex(index=df.index[4:])
    expected = df_new[df_new.B > 0]
    tm.assert_frame_equal(result, expected)

    # data column selection with a string data_column
    df_new = df.copy()
    df_new["string"] = "foo"
    df_new.loc[df_new.index[1:4], "string"] = np.nan
    df_new.loc[df_new.index[5:6], "string"] = "bar"
    temp_hdfstore.remove("df")
    temp_hdfstore.append("df", df_new, data_columns=["string"])
    result = temp_hdfstore.select("df", "string='foo'")
    expected = df_new[df_new.string == "foo"]
    tm.assert_frame_equal(result, expected)

    # using min_itemsize and a data column
    def check_col(key, name, size):
        # Assert the on-disk itemsize of column ``name`` in table ``key``.
        assert (
            getattr(temp_hdfstore.get_storer(key).table.description, name).itemsize
            == size
        )

    temp_hdfstore.remove("df")
    temp_hdfstore.append(
        "df", df_new, data_columns=["string"], min_itemsize={"string": 30}
    )
    check_col("df", "string", 30)

    temp_hdfstore.remove("df")
    temp_hdfstore.append("df", df_new, data_columns=["string"], min_itemsize=30)
    check_col("df", "string", 30)

    temp_hdfstore.remove("df")
    temp_hdfstore.append(
        "df", df_new, data_columns=["string"], min_itemsize={"values": 30}
    )
    check_col("df", "string", 30)

    df_new["string2"] = "foobarbah"
    df_new["string_block1"] = "foobarbah1"
    df_new["string_block2"] = "foobarbah2"
    temp_hdfstore.remove("df")
    temp_hdfstore.append(
        "df",
        df_new,
        data_columns=["string", "string2"],
        min_itemsize={"string": 30, "string2": 40, "values": 50},
    )
    check_col("df", "string", 30)
    check_col("df", "string2", 40)
    check_col("df", "values_block_1", 50)

    # multiple data columns
    df_new = df.copy()
    df_new.iloc[0, df_new.columns.get_loc("A")] = 1.0
    df_new.iloc[0, df_new.columns.get_loc("B")] = -1.0
    df_new["string"] = "foo"

    sl = df_new.columns.get_loc("string")
    df_new.iloc[1:4, sl] = np.nan
    df_new.iloc[5:6, sl] = "bar"

    df_new["string2"] = "foo"
    sl = df_new.columns.get_loc("string2")
    df_new.iloc[2:5, sl] = np.nan
    df_new.iloc[7:8, sl] = "bar"

    temp_hdfstore.remove("df")
    temp_hdfstore.append("df", df_new, data_columns=["A", "B", "string", "string2"])
    result = temp_hdfstore.select(
        "df", "string='foo' and string2='foo' and A>0 and B<0"
    )
    expected = df_new[
        (df_new.string == "foo")
        & (df_new.string2 == "foo")
        & (df_new.A > 0)
        & (df_new.B < 0)
    ]
    tm.assert_frame_equal(result, expected, check_freq=False)
    # FIXME: 2020-05-07 freq check randomly fails in the CI

    # yield an empty frame
    result = temp_hdfstore.select("df", "string='foo' and string2='cool'")
    expected = df_new[(df_new.string == "foo") & (df_new.string2 == "cool")]
    tm.assert_frame_equal(result, expected)

    # doc example
    df_dc = df.copy()
    df_dc["string"] = "foo"
    df_dc.loc[df_dc.index[4:6], "string"] = np.nan
    df_dc.loc[df_dc.index[7:9], "string"] = "bar"
    df_dc["string2"] = "cool"
    df_dc["datetime"] = Timestamp("20010102").as_unit("ns")
    df_dc.loc[df_dc.index[3:5], ["A", "B", "datetime"]] = np.nan

    temp_hdfstore.append(
        "df_dc", df_dc, data_columns=["B", "C", "string", "string2", "datetime"]
    )
    result = temp_hdfstore.select("df_dc", "B>0")
    expected = df_dc[df_dc.B > 0]
    tm.assert_frame_equal(result, expected)

    result = temp_hdfstore.select("df_dc", ["B > 0", "C > 0", "string == foo"])
    expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")]
    tm.assert_frame_equal(result, expected, check_freq=False)
    # FIXME: 2020-12-07 intermittent build failures here with freq of
    #  None instead of BDay(4)

    # doc example part 2
    index = date_range("1/1/2000", periods=8)
    df_dc = DataFrame(
        np.random.default_rng(2).standard_normal((8, 3)),
        index=index,
        columns=["A", "B", "C"],
    )
    df_dc["string"] = "foo"
    df_dc.loc[df_dc.index[4:6], "string"] = np.nan
    df_dc.loc[df_dc.index[7:9], "string"] = "bar"
    df_dc[["B", "C"]] = df_dc[["B", "C"]].abs()
    df_dc["string2"] = "cool"

    # on-disk operations
    temp_hdfstore.remove("df_dc")
    temp_hdfstore.append("df_dc", df_dc, data_columns=["B", "C", "string", "string2"])

    result = temp_hdfstore.select("df_dc", "B>0")
    expected = df_dc[df_dc.B > 0]
    tm.assert_frame_equal(result, expected)

    result = temp_hdfstore.select("df_dc", ["B > 0", "C > 0", 'string == "foo"'])
    expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")]
    tm.assert_frame_equal(result, expected)


def test_append_hierarchical(temp_hdfstore, multiindex_dataframe_random_data):
    """A multi-index frame round-trips and supports column selection."""
    df = multiindex_dataframe_random_data
    df.columns.name = None

    temp_hdfstore.append("mi", df)
    result = temp_hdfstore.select("mi")
    tm.assert_frame_equal(result, df)

    # GH 3748
    result = temp_hdfstore.select("mi", columns=["A", "B"])
    expected = df.reindex(columns=["A", "B"])
    tm.assert_frame_equal(result, expected)

    df.to_hdf(temp_hdfstore, key="df", format="table")
    result = read_hdf(temp_hdfstore, "df", columns=["A", "B"])
    expected = df.reindex(columns=["A", "B"])
    tm.assert_frame_equal(result, expected)


def test_append_misc(temp_hdfstore):
    """``chunksize`` and ``expectedrows`` do not change what is stored."""
    df = DataFrame(
        1.1 * np.arange(120).reshape((30, 4)),
        columns=Index(list("ABCD")),
        index=Index([f"i-{i}" for i in range(30)]),
    )
    temp_hdfstore.append("df", df, chunksize=1)
    result = temp_hdfstore.select("df")
    tm.assert_frame_equal(result, df)

    temp_hdfstore.append("df1", df, expectedrows=10)
    result = temp_hdfstore.select("df1")
    tm.assert_frame_equal(result, df)


@pytest.mark.parametrize("chunksize", [10, 200, 1000])
def test_append_misc_chunksize(temp_hdfstore, chunksize):
    """A mixed-dtype frame round-trips for several ``chunksize`` values."""
    # more chunksize in append tests
    df = DataFrame(
        1.1 * np.arange(120).reshape((30, 4)),
        columns=Index(list("ABCD")),
        index=Index([f"i-{i}" for i in range(30)]),
    )
    df["string"] = "foo"
    df["float322"] = 1.0
    df["float322"] = df["float322"].astype("float32")
    df["bool"] = df["float322"] > 0
    df["time1"] = Timestamp("20130101").as_unit("ns")
    df["time2"] = Timestamp("20130102").as_unit("ns")
    temp_hdfstore.append("obj", df, chunksize=chunksize)
    result = temp_hdfstore.select("obj")
    tm.assert_frame_equal(result, df)


def test_append_misc_empty_frame(temp_hdfstore):
    """Appending an empty frame stores nothing; ``put`` keeps an empty frame."""
    # empty frame, GH4273
    # 0 len
    df_empty = DataFrame(columns=list("ABC"))
    temp_hdfstore.append("df", df_empty)
    with pytest.raises(KeyError, match="'No object named df in the file'"):
        temp_hdfstore.select("df")

    # repeated append of 0/non-zero frames
    df = DataFrame(np.random.default_rng(2).random((10, 3)), columns=list("ABC"))
    temp_hdfstore.append("df", df)
    tm.assert_frame_equal(temp_hdfstore.select("df"), df)
    temp_hdfstore.append("df", df_empty)
    tm.assert_frame_equal(temp_hdfstore.select("df"), df)

    # store
    df = DataFrame(columns=list("ABC"))
    temp_hdfstore.put("df2", df)
    tm.assert_frame_equal(temp_hdfstore.select("df2"), df)


def test_append_raise(temp_hdfstore, using_infer_string):
    """Invalid inputs to ``append`` raise with informative messages."""
    # test append with invalid input to get good error messages

    # list in column
    df = DataFrame(
        1.1 * np.arange(120).reshape((30, 4)),
        columns=Index(list("ABCD")),
        index=Index([f"i-{i}" for i in range(30)]),
    )
    df["invalid"] = [["a"]] * len(df)
    assert df.dtypes["invalid"] == np.object_
    # NOTE(review): the line break inside this triple-quoted literal was lost
    # when the file was collapsed; it is restored after "[invalid]" on the
    # assumption that the raised message breaks there -- confirm against the
    # message actually raised by the store.
    msg = re.escape(
        """Cannot serialize the column [invalid]
because its data contents are not [string] but [mixed] object dtype"""
    )
    with pytest.raises(TypeError, match=msg):
        temp_hdfstore.append("df", df)

    # multiple invalid columns
    df["invalid2"] = [["a"]] * len(df)
    df["invalid3"] = [["a"]] * len(df)
    with pytest.raises(TypeError, match=msg):
        temp_hdfstore.append("df", df)

    # datetime with embedded nans as object
    df = DataFrame(
        1.1 * np.arange(120).reshape((30, 4)),
        columns=Index(list("ABCD")),
        index=Index([f"i-{i}" for i in range(30)]),
    )
    s = Series(datetime.datetime(2001, 1, 2), index=df.index)
    s = s.astype(object)
    s[0:5] = np.nan
    df["invalid"] = s
    assert df.dtypes["invalid"] == np.object_
    msg = "too many timezones in this block, create separate data columns"
    with pytest.raises(TypeError, match=msg):
        temp_hdfstore.append("df", df)

    # directly ndarray
    msg = "value must be None, Series, or DataFrame"
    with pytest.raises(TypeError, match=msg):
        temp_hdfstore.append("df", np.arange(10))

    # series directly
    # NOTE(review): the expected text previously ended in "value->]", which
    # looks truncated (an angle-bracketed value after "value->" appears to
    # have been stripped from the source). Only the intact prefix is matched
    # here; restore the full expected message once the original is confirmed.
    msg = re.escape("cannot properly create the storer for: [group->df,value->")
    with pytest.raises(TypeError, match=msg):
        temp_hdfstore.append("df", Series(np.arange(10)))

    # appending an incompatible table
    df = DataFrame(
        1.1 * np.arange(120).reshape((30, 4)),
        columns=Index(list("ABCD")),
        index=Index([f"i-{i}" for i in range(30)]),
    )
    temp_hdfstore.append("df", df)

    df["foo"] = "foo"
    msg = re.escape(
        "invalid combination of [non_index_axes] on appending data "
        "[(1, ['A', 'B', 'C', 'D', 'foo'])] vs current table "
        "[(1, ['A', 'B', 'C', 'D'])]"
    )
    with pytest.raises(ValueError, match=msg):
        temp_hdfstore.append("df", df)

    # incompatible type (GH 41897)
    df["foo"] = Timestamp("20130101")
    temp_hdfstore.remove("df")
    temp_hdfstore.append("df", df)
    df["foo"] = "bar"
    msg = re.escape(
        "Cannot serialize the column [foo] "
        "because its data contents are not [string] "
        "but [datetime64[us]] object dtype"
    )
    with pytest.raises(ValueError, match=msg):
        temp_hdfstore.append("df", df)


def test_append_with_timedelta(temp_hdfstore, unit):
    """A timedelta column can be appended as a data column and selected."""
    # GH 3577
    # append timedelta
    ts = Timestamp("20130101").as_unit("ns")
    df = DataFrame(
        {
            "A": ts,
            "B": [ts + timedelta(days=i, seconds=10) for i in range(10)],
        }
    )
    df["C"] = df["A"] - df["B"]
    df["C"] = df["C"].astype(f"m8[{unit}]")
    df.loc[3:5, "C"] = np.nan

    # table
    temp_hdfstore.append("df", df, data_columns=True)
    result = temp_hdfstore.select("df")
    tm.assert_frame_equal(result, df)

    result = temp_hdfstore.select("df", where="C<100000")
    tm.assert_frame_equal(result, df)

    # NOTE(review): the source is corrupted from this point. Everything from
    # the middle of the next ``select`` call in this test up to the tail of a
    # later ``select_as_multiple``-style call (apparently belonging to a
    # separate test that uses selector="df1") is missing. The surviving
    # fragment, kept verbatim so nothing further is lost, was:
    #
    #     result = temp_hdfstore.select("df", where="C0", "B>0"], selector="df1" )
    #     expected = df[(df.A > 0) & (df.B > 0)]
    #     tm.assert_frame_equal(result, expected)
    #
    # Restore the missing statements (and the missing test definition) from
    # version control; they are deliberately not guessed at here.


def test_append_to_multiple_dropna(temp_hdfstore):
    """``append_to_multiple(dropna=True)`` keeps the tables' rows in sync."""
    df1 = DataFrame(
        np.random.default_rng(2).standard_normal((10, 4)),
        columns=Index(list("ABCD")),
        index=date_range("2000-01-01", periods=10, freq="B"),
    )
    df2 = DataFrame(
        np.random.default_rng(2).standard_normal((10, 4)),
        columns=Index(list("ABCD")),
        index=date_range("2000-01-01", periods=10, freq="B"),
    ).rename(columns="{}_2".format)
    df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan
    df = concat([df1, df2], axis=1)

    # dropna=True should guarantee rows are synchronized
    temp_hdfstore.append_to_multiple(
        {"df1": ["A", "B"], "df2": None}, df, selector="df1", dropna=True
    )
    result = temp_hdfstore.select_as_multiple(["df1", "df2"])
    expected = df.dropna()
    tm.assert_frame_equal(result, expected, check_index_type=True)
    tm.assert_index_equal(
        temp_hdfstore.select("df1").index, temp_hdfstore.select("df2").index
    )


def test_append_to_multiple_dropna_false(temp_hdfstore):
    """``append_to_multiple(dropna=False)`` leaves the tables out of sync."""
    df1 = DataFrame(
        np.random.default_rng(2).standard_normal((10, 4)),
        columns=Index(list("ABCD")),
        index=date_range("2000-01-01", periods=10, freq="B"),
    )
    df2 = df1.copy().rename(columns="{}_2".format)
    df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan
    df = concat([df1, df2], axis=1)

    with pd.option_context("io.hdf.dropna_table", True):
        # dropna=False shouldn't synchronize row indexes
        temp_hdfstore.append_to_multiple(
            {"df1a": ["A", "B"], "df2a": None}, df, selector="df1a", dropna=False
        )

        msg = "all tables must have exactly the same nrows!"
        with pytest.raises(ValueError, match=msg):
            temp_hdfstore.select_as_multiple(["df1a", "df2a"])

        assert not temp_hdfstore.select("df1a").index.equals(
            temp_hdfstore.select("df2a").index
        )


def test_append_to_multiple_min_itemsize(temp_hdfstore):
    """``min_itemsize`` is accepted by ``append_to_multiple``."""
    # GH 11238
    df = DataFrame(
        {
            "IX": np.arange(1, 21),
            "Num": np.arange(1, 21),
            "BigNum": np.arange(1, 21) * 88,
            "Str": ["a" for _ in range(20)],
            "LongStr": ["abcde" for _ in range(20)],
        }
    )
    expected = df.iloc[[0]]
    # Reading/writing RangeIndex info is not supported yet
    expected.index = Index(list(range(len(expected.index))))

    temp_hdfstore.append_to_multiple(
        {
            "index": ["IX"],
            "nums": ["Num", "BigNum"],
            "strs": ["Str", "LongStr"],
        },
        df.iloc[[0]],
        "index",
        min_itemsize={"Str": 10, "LongStr": 100, "Num": 2},
    )
    result = temp_hdfstore.select_as_multiple(["index", "nums", "strs"])
    tm.assert_frame_equal(result, expected, check_index_type=True)


def test_append_string_nan_rep(temp_hdfstore):
    """Appending NaNs fails when ``nan_rep`` does not fit the string column."""
    # GH 16300
    df = DataFrame({"A": "a", "B": "foo"}, index=np.arange(10))
    df_nan = df.copy()
    df_nan.loc[0:4, :] = np.nan
    msg = "NaN representation is too large for existing column size"

    # string column too small
    temp_hdfstore.append("sa", df["A"])
    with pytest.raises(ValueError, match=msg):
        temp_hdfstore.append("sa", df_nan["A"])

    # nan_rep too big
    temp_hdfstore.append("sb", df["B"], nan_rep="bars")
    with pytest.raises(ValueError, match=msg):
        temp_hdfstore.append("sb", df_nan["B"])

    # smaller modified nan_rep
    temp_hdfstore.append("sc", df["A"], nan_rep="n")
    temp_hdfstore.append("sc", df_nan["A"])
    result = temp_hdfstore["sc"]
    expected = concat([df["A"], df_nan["A"]])
    tm.assert_series_equal(result, expected)