from datetime import (
    datetime,
    timedelta,
)

import numpy as np
import pytest

from pandas.compat import pa_version_under21p0

from pandas import (
    NA,
    DataFrame,
    Index,
    MultiIndex,
    Series,
    StringDtype,
    option_context,
)
import pandas._testing as tm
from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin
from pandas.core.strings.accessor import StringMethods
from pandas.tests.strings import is_object_or_nan_string_dtype


@pytest.mark.parametrize("pattern", [0, True, Series(["foo", "bar"])])
def test_startswith_endswith_non_str_patterns(pattern):
    """Check that startswith/endswith raise TypeError for a non-str pattern."""
    # GH3485
    ser = Series(["foo", "bar"])
    # The message names the concrete type of the offending pattern.
    msg = f"expected a string or tuple, not {type(pattern).__name__}"
    with pytest.raises(TypeError, match=msg):
        ser.str.startswith(pattern)
    with pytest.raises(TypeError, match=msg):
        ser.str.endswith(pattern)


def test_iter_raises():
    """Check that calling iter() on the .str accessor raises TypeError."""
    # GH 54173
    ser = Series(["foo", "bar"])
    with pytest.raises(TypeError, match="'StringMethods' object is not iterable"):
        iter(ser.str)


# test integer/float dtypes (inferred by constructor) and mixed


def test_count(any_string_dtype):
    """Check str.count with a regex pattern, including a missing value."""
    ser = Series(["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=any_string_dtype)
    result = ser.str.count("f[o]+")
    # The expected result dtype and missing-value marker depend on the input
    # dtype: float64/NaN for object or NaN-backed strings, Int64/NA otherwise.
    if is_object_or_nan_string_dtype(any_string_dtype):
        expected_dtype = np.float64
        item = np.nan
    else:
        expected_dtype = "Int64"
        item = NA
    expected = Series([1, 2, item, 4], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)


def test_count_mixed_object():
    """Check str.count on an object Series mixing strings with other values."""
    ser = Series(
        ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
        dtype=object,
    )
    result = ser.str.count("a")
    # Only the str elements get a count; every other position is NaN.
    expected = Series([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan])
    tm.assert_series_equal(result, expected)


# NOTE(review): the source is cut off inside the fourth pattern literal below;
# the rest of this parametrization and the test it decorates are not visible
# here, so the fragment is kept exactly as found -- restore from the upstream
# file before relying on it.
@pytest.mark.parametrize(
    "pat, expected_data",
    [
        (r"a(?=b)", [0, 1, 0, 0, None]),
        (r"(?<=a)b", [0, 1, 0, 0, None]),
        (r"a(?!b)", [2, 0, 1, 0, None]),
        (r"(?"
) with pytest.raises(UnicodeEncodeError, match=msg): ser.str.encode("cp1252") result = ser.str.encode("cp1252", "ignore") expected = ser.map(lambda x: x.encode("cp1252", "ignore")) tm.assert_series_equal(result, expected) def test_decode_errors_kwarg(): ser = Series([b"a", b"b", b"a\x9d"]) msg = ( "'charmap' codec can't decode byte 0x9d in position 1: " "character maps to " ) with pytest.raises(UnicodeDecodeError, match=msg): ser.str.decode("cp1252") result = ser.str.decode("cp1252", "ignore") expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype("str") tm.assert_series_equal(result, expected) def test_decode_string_dtype(string_dtype): # https://github.com/pandas-dev/pandas/pull/60940 ser = Series([b"a", b"b"]) result = ser.str.decode("utf-8", dtype=string_dtype) expected = Series(["a", "b"], dtype=string_dtype) tm.assert_series_equal(result, expected) def test_decode_object_dtype(object_dtype): # https://github.com/pandas-dev/pandas/pull/60940 ser = Series([b"a", rb"\ud800"]) result = ser.str.decode("utf-8", dtype=object_dtype) expected = Series(["a", r"\ud800"], dtype=object_dtype) tm.assert_series_equal(result, expected) def test_decode_bad_dtype(): # https://github.com/pandas-dev/pandas/pull/60940 ser = Series([b"a", b"b"]) msg = "dtype must be string or object, got dtype='int64'" with pytest.raises(ValueError, match=msg): ser.str.decode("utf-8", dtype="int64") @pytest.mark.parametrize( "form, expected", [ ("NFKC", ["ABC", "ABC", "123", np.nan, "アイエ"]), ("NFC", ["ABC", "ABC", "123", np.nan, "アイエ"]), # noqa: RUF001 ], ) def test_normalize(form, expected, any_string_dtype): ser = Series( ["ABC", "ABC", "123", np.nan, "アイエ"], # noqa: RUF001 index=["a", "b", "c", "d", "e"], dtype=any_string_dtype, ) expected = Series(expected, index=["a", "b", "c", "d", "e"], dtype=any_string_dtype) result = ser.str.normalize(form) tm.assert_series_equal(result, expected) def test_normalize_bad_arg_raises(any_string_dtype): ser = Series( ["ABC", "ABC", "123", np.nan, 
"アイエ"], # noqa: RUF001 index=["a", "b", "c", "d", "e"], dtype=any_string_dtype, ) with pytest.raises(ValueError, match="invalid normalization form"): ser.str.normalize("xxx") def test_normalize_index(): idx = Index(["ABC", "123", "アイエ"]) # noqa: RUF001 expected = Index(["ABC", "123", "アイエ"]) result = idx.str.normalize("NFKC") tm.assert_index_equal(result, expected) @pytest.mark.parametrize( "values,inferred_type", [ (["a", "b"], "string"), (["a", "b", 1], "mixed-integer"), (["a", "b", 1.3], "mixed"), (["a", "b", 1.3, 1], "mixed-integer"), (["aa", datetime(2011, 1, 1)], "mixed"), ], ) def test_index_str_accessor_visibility(values, inferred_type, index_or_series): obj = index_or_series(values) if index_or_series is Index: assert obj.inferred_type == inferred_type assert isinstance(obj.str, StringMethods) @pytest.mark.parametrize( "values,inferred_type", [ ([1, np.nan], "floating"), ([datetime(2011, 1, 1)], "datetime64"), ([timedelta(1)], "timedelta64"), ], ) def test_index_str_accessor_non_string_values_raises( values, inferred_type, index_or_series ): obj = index_or_series(values) if index_or_series is Index: assert obj.inferred_type == inferred_type msg = "Can only use .str accessor with string values" with pytest.raises(AttributeError, match=msg): obj.str def test_index_str_accessor_multiindex_raises(): # MultiIndex has mixed dtype, but not allow to use accessor idx = MultiIndex.from_tuples([("a", "b"), ("a", "b")]) assert idx.inferred_type == "mixed" msg = "Can only use .str accessor with Index, not MultiIndex" with pytest.raises(AttributeError, match=msg): idx.str def test_str_accessor_no_new_attributes(any_string_dtype): # https://github.com/pandas-dev/pandas/issues/10673 ser = Series(list("aabbcde"), dtype=any_string_dtype) with pytest.raises(AttributeError, match="You cannot add any new attribute"): ser.str.xlabel = "a" def test_cat_on_bytes_raises(): lhs = Series(np.array(list("abc"), "S1").astype(object)) rhs = Series(np.array(list("def"), 
"S1").astype(object)) msg = "Cannot use .str.cat with values of inferred dtype 'bytes'" with pytest.raises(TypeError, match=msg): lhs.str.cat(rhs) def test_str_accessor_in_apply_func(): # https://github.com/pandas-dev/pandas/issues/38979 df = DataFrame(zip("abc", "def", strict=True)) expected = Series(["A/D", "B/E", "C/F"]) result = df.apply(lambda f: "/".join(f.str.upper()), axis=1) tm.assert_series_equal(result, expected) def test_zfill(): # https://github.com/pandas-dev/pandas/issues/20868 value = Series(["-1", "1", "1000", 10, np.nan]) expected = Series(["-01", "001", "1000", np.nan, np.nan], dtype=object) tm.assert_series_equal(value.str.zfill(3), expected) value = Series(["-2", "+5"]) expected = Series(["-0002", "+0005"]) tm.assert_series_equal(value.str.zfill(5), expected) def test_zfill_with_non_integer_argument(): value = Series(["-2", "+5"]) wid = "a" msg = f"width must be of integer type, not {type(wid).__name__}" with pytest.raises(TypeError, match=msg): value.str.zfill(wid) def test_zfill_with_leading_sign(): value = Series(["-cat", "-1", "+dog"]) expected = Series(["-0cat", "-0001", "+0dog"]) tm.assert_series_equal(value.str.zfill(5), expected) def test_get_with_dict_label(): # GH47911 s = Series( [ {"name": "Hello", "value": "World"}, {"name": "Goodbye", "value": "Planet"}, {"value": "Sea"}, ] ) result = s.str.get("name") expected = Series(["Hello", "Goodbye", None], dtype=object) tm.assert_series_equal(result, expected) result = s.str.get("value") expected = Series(["World", "Planet", "Sea"], dtype=object) tm.assert_series_equal(result, expected) def test_series_str_decode(): # GH 22613 result = Series([b"x", b"y"]).str.decode(encoding="UTF-8", errors="strict") expected = Series(["x", "y"], dtype="str") tm.assert_series_equal(result, expected) def test_decode_with_dtype_none(): with option_context("future.infer_string", True): ser = Series([b"a", b"b", b"c"]) result = ser.str.decode("utf-8", dtype=None) expected = Series(["a", "b", "c"], 
dtype="str") tm.assert_series_equal(result, expected) def test_setitem_with_different_string_storage(): # GH#52987 # Test setitem with values from different string storage type pytest.importorskip("pyarrow") # Test Series[string[python]].__setitem__(Series[string[pyarrow]]) ser_python = Series(range(5), dtype="string[python]") ser_pyarrow = ser_python.astype("string[pyarrow]") ser_python[:2] = ser_pyarrow[:2] expected = Series(["0", "1", "2", "3", "4"], dtype="string[python]") tm.assert_series_equal(ser_python, expected) # Test Series[string[pyarrow]].__setitem__(Series[string[python]]) ser_pyarrow = Series(range(5), dtype="string[pyarrow]") ser_python = ser_pyarrow.astype("string[python]") ser_pyarrow[:2] = ser_python[:2] expected = Series(["0", "1", "2", "3", "4"], dtype="string[pyarrow]") tm.assert_series_equal(ser_pyarrow, expected) # Test with slice and missing values ser_python = Series(["a", "b", None, "d", "e"], dtype="string[python]") ser_pyarrow = Series(["X", "Y", None], dtype="string[pyarrow]") ser_python[1:4] = ser_pyarrow expected = Series(["a", "X", "Y", NA, "e"], dtype="string[python]") tm.assert_series_equal(ser_python, expected) @pytest.mark.parametrize( "pat, expected", [ # lookaround assertions (r"(?=abc)", True), (r"(?<=123)", True), (r"(?!xyz)", True), (r"(?\w+)\s+(?P=word)\b", True), ], ) def test_has_regex_unsupported_code(pat, expected): # https://github.com/pandas-dev/pandas/issues/60833 assert ArrowStringArrayMixin._has_unsupported_regex(pat) == expected