from datetime import datetime import re import numpy as np import pytest from pandas._libs import lib import pandas.util._test_decorators as td import pandas as pd from pandas import ( Series, StringDtype, _testing as tm, ) from pandas.tests.strings import ( _convert_na_value, is_object_or_nan_string_dtype, ) # -------------------------------------------------------------------------------------- # str.contains # -------------------------------------------------------------------------------------- def test_contains(any_string_dtype): values = np.array( ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_ ) values = Series(values, dtype=any_string_dtype) pat = "mmm[_]+" result = values.str.contains(pat) if any_string_dtype == "str": # NaN propagates as False expected = Series([False, False, True, True, False], dtype=bool) else: expected_dtype = ( "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" ) expected = Series( np.array([False, np.nan, True, True, False], dtype=np.object_), dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = values.str.contains(pat, regex=False) if any_string_dtype == "str": expected = Series([False, False, False, False, True], dtype=bool) else: expected = Series( np.array([False, np.nan, False, False, True], dtype=np.object_), dtype=expected_dtype, ) tm.assert_series_equal(result, expected) values = Series( np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=object), dtype=any_string_dtype, ) result = values.str.contains(pat) expected_dtype = ( np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" ) expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) # case insensitive using regex values = Series( np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object), dtype=any_string_dtype, ) result = values.str.contains("FOO|mmm", case=False) expected = Series(np.array([True, False, True, 
True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) # case insensitive without regex result = values.str.contains("foo", regex=False, case=False) expected = Series(np.array([True, False, True, False]), dtype=expected_dtype) tm.assert_series_equal(result, expected) # unicode values = Series( np.array(["foo", np.nan, "fooommm__foo", "mmm_"], dtype=np.object_), dtype=any_string_dtype, ) pat = "mmm[_]+" result = values.str.contains(pat) if any_string_dtype == "str": expected = Series([False, False, True, True], dtype=bool) else: expected_dtype = ( "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" ) expected = Series( np.array([False, np.nan, True, True], dtype=np.object_), dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = values.str.contains(pat, na=False) expected_dtype = ( np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" ) expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) values = Series( np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=np.object_), dtype=any_string_dtype, ) result = values.str.contains(pat) expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) def test_contains_object_mixed(): mixed = Series( np.array( ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], dtype=object, ) ) result = mixed.str.contains("o") expected = Series( np.array( [False, np.nan, False, np.nan, np.nan, True, None, np.nan, np.nan], dtype=np.object_, ) ) tm.assert_series_equal(result, expected) def test_contains_na_kwarg_for_object_category(): # gh 22158 # na for category values = Series(["a", "b", "c", "a", np.nan], dtype="category") result = values.str.contains("a", na=True) expected = Series([True, False, False, True, True]) tm.assert_series_equal(result, expected) result = values.str.contains("a", na=False) expected = Series([True, 
False, False, True, False]) tm.assert_series_equal(result, expected) # na for objects values = Series(["a", "b", "c", "a", np.nan]) result = values.str.contains("a", na=True) expected = Series([True, False, False, True, True]) tm.assert_series_equal(result, expected) result = values.str.contains("a", na=False) expected = Series([True, False, False, True, False]) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "na, expected", [ (None, pd.NA), (True, True), (False, False), (0, False), (3, True), (np.nan, pd.NA), ], ) @pytest.mark.parametrize("regex", [True, False]) def test_contains_na_kwarg_for_nullable_string_dtype( nullable_string_dtype, na, expected, regex ): # https://github.com/pandas-dev/pandas/pull/41025#issuecomment-824062416 values = Series(["a", "b", "c", "a", np.nan], dtype=nullable_string_dtype) if na in [0, 3] and na is not False: msg = f"na must be None, pd.NA, np.nan, True, or False; got {na}" with pytest.raises(ValueError, match=msg): values.str.contains("a", na=na, regex=regex) else: result = values.str.contains("a", na=na, regex=regex) expected = Series([True, False, False, True, expected], dtype="boolean") tm.assert_series_equal(result, expected) def test_contains_moar(any_string_dtype): # PR #1179 s = Series( ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"], dtype=any_string_dtype, ) result = s.str.contains("a") if any_string_dtype == "str": # NaN propagates as False expected_dtype = bool na_value = False else: expected_dtype = ( "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" ) na_value = np.nan expected = Series( [False, False, False, True, True, False, na_value, False, False, True], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("a", case=False) expected = Series( [True, False, False, True, True, False, na_value, True, False, True], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("Aa") expected = 
Series( [False, False, False, True, False, False, na_value, False, False, False], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("ba") expected = Series( [False, False, False, True, False, False, na_value, False, False, False], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("ba", case=False) expected = Series( [False, False, False, True, True, False, na_value, True, False, False], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) def test_contains_nan(any_string_dtype): # PR #14171 s = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype) result = s.str.contains("foo", na=False) expected_dtype = ( np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" ) expected = Series([False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = s.str.contains("foo", na=True) expected = Series([True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) msg = "na must be None, pd.NA, np.nan, True, or False; got foo" with pytest.raises(ValueError, match=msg): s.str.contains("foo", na="foo") result = s.str.contains("foo") if any_string_dtype == "str": # NaN propagates as False expected = Series([False, False, False], dtype=bool) else: expected_dtype = ( "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" ) expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) def test_contains_compiled_regex(any_string_dtype): # GH#61942 expected_dtype = ( np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" ) ser = Series(["foo", "bar", "Baz"], dtype=any_string_dtype) pat = re.compile("ba.") result = ser.str.contains(pat) expected = Series([False, True, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) # TODO this currently works for pyarrow-backed dtypes but raises for python if any_string_dtype == 
"string" and any_string_dtype.storage == "pyarrow": result = ser.str.contains(pat, case=False) expected = Series([False, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) else: with pytest.raises( ValueError, match="cannot process flags argument with a compiled pattern" ): ser.str.contains(pat, case=False) pat = re.compile("ba.", flags=re.IGNORECASE) result = ser.str.contains(pat) expected = Series([False, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) # TODO should this be supported? with pytest.raises( ValueError, match="cannot process flags argument with a compiled pattern" ): ser.str.contains(pat, flags=re.IGNORECASE) def test_contains_compiled_regex_flags(any_string_dtype): # ensure other (than ignorecase) flags are respected expected_dtype = ( np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" ) ser = Series(["foobar", "foo\nbar", "Baz"], dtype=any_string_dtype) pat = re.compile("^ba") result = ser.str.contains(pat) expected = Series([False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) pat = re.compile("^ba", flags=re.MULTILINE) result = ser.str.contains(pat) expected = Series([False, True, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) pat = re.compile("^ba", flags=re.MULTILINE | re.IGNORECASE) result = ser.str.contains(pat) expected = Series([False, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "pat, expected_data", [ (r"a(?=b)", [False, True, False, False]), (r"(?<=a)b", [False, True, False, False]), (r"a(?!b)", [True, False, True, False]), (r"(? 
# NOTE(review): the opening of this decorator and the group names in the
# patterns below were damaged in the reviewed text (``<...>`` spans stripped).
# They are reconstructed from the test signature, the five ``ids`` and the
# expected outputs; the names ``first``/``second``/``third`` are an inference --
# confirm against version control.
@pytest.mark.parametrize(
    "repl, expected_list",
    [
        (
            r"\g<third> \g<second> \g<first>",
            ["Three Two One", "Baz Bar Foo"],
        ),
        (
            r"\3 \2 \1",
            ["Three Two One", "Baz Bar Foo"],
        ),
        (
            r"\g<3> \g<2> \g<1>",
            ["Three Two One", "Baz Bar Foo"],
        ),
        (
            r"\g<2>0",
            ["Two0", "Bar0"],
        ),
        (
            r"\g<2>0 \1",
            ["Two0 One", "Bar0 Foo"],
        ),
    ],
    ids=[
        "named_groups_full_swap",
        "numbered_groups_no_g_full_swap",
        "numbered_groups_full_swap",
        "single_group_with_literal",
        "mixed_group_reference_with_literal",
    ],
)
@pytest.mark.parametrize("use_compile", [True, False])
def test_replace_named_groups_regex_swap(
    any_string_dtype, use_compile, repl, expected_list
):
    """Check group references in ``repl`` for string and compiled patterns."""
    # GH#57636
    ser = Series(["One Two Three", "Foo Bar Baz"], dtype=any_string_dtype)
    pattern = r"(?P<first>\w+) (?P<second>\w+) (?P<third>\w+)"
    if use_compile:
        pattern = re.compile(pattern)
    result = ser.str.replace(pattern, repl, regex=True)
    expected = Series(expected_list, dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "repl",
    [
        r"\g<20>",
        r"\20",
        r"\40",
        r"\4",
    ],
)
@pytest.mark.parametrize("use_compile", [True, False])
def test_replace_named_groups_regex_swap_expected_fail(
    any_string_dtype, repl, use_compile, request
):
    """Check that references to non-existent groups raise.

    The expected exception type and message differ between the branch taken for
    pyarrow-backed ``StringDtype`` with a non-compiled pattern and every other
    combination.
    """
    # GH#57636
    if (
        not use_compile
        and r"\g" not in repl
        and isinstance(any_string_dtype, StringDtype)
        and any_string_dtype.storage == "pyarrow"
    ):
        # calls pyarrow method directly
        if repl == r"\20":
            mark = pytest.mark.xfail(reason="PyArrow interprets as group + literal")
            request.applymarker(mark)

        pa = pytest.importorskip("pyarrow")
        error_type = pa.ArrowInvalid
        error_msg = r"only has \d parenthesized subexpressions"
    else:
        error_type = re.error
        error_msg = "invalid group reference"

    # group names restored (see NOTE at the top of this section); this test only
    # uses numeric references, so the names themselves do not affect the outcome
    pattern = r"(?P<first>\w+) (?P<second>\w+) (?P<third>\w+)"
    if use_compile:
        pattern = re.compile(pattern)
    ser = Series(["One Two Three", "Foo Bar Baz"], dtype=any_string_dtype)

    with pytest.raises(error_type, match=error_msg):
        ser.str.replace(pattern, repl, regex=True)


@pytest.mark.parametrize(
    "pattern, repl",
    [
        (r"(\w+) (\w+) (\w+)", r"\20"),
        (r"(?P<first>\w+) (?P<second>\w+) (?P<third>\w+)", r"\20"),
    ],
)
def test_pyarrow_ambiguous_group_references(pyarrow_string_dtype, pattern, repl):
    r"""Check that ``\20`` yields group 2 followed by a literal ``0`` here."""
    # GH#62653
    ser = Series(["One Two Three", "Foo Bar Baz"], dtype=pyarrow_string_dtype)

    result = ser.str.replace(pattern, repl, regex=True)
    expected = Series(["Two0", "Bar0"], dtype=pyarrow_string_dtype)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "pattern, repl, expected_list",
    [
        (
            # group name restored; ``index`` is an inferred name -- TODO confirm
            r"\[(?P<index>\d+)\]",
            r"(\1)",
            ["var.one(0)", "var.two(1)", "var.three(2)"],
        ),
        (
            r"\[(\d+)\]",
            r"(\1)",
            ["var.one(0)", "var.two(1)", "var.three(2)"],
        ),
    ],
)
@td.skip_if_no("pyarrow")
def test_pyarrow_backend_group_replacement(pattern, repl, expected_list):
    """Check numbered group replacement after ``convert_dtypes`` to pyarrow."""
    ser = Series(["var.one[0]", "var.two[1]", "var.three[2]"]).convert_dtypes(
        dtype_backend="pyarrow"
    )
    result = ser.str.replace(pattern, repl, regex=True)
    expected = Series(expected_list).convert_dtypes(dtype_backend="pyarrow")
    tm.assert_series_equal(result, expected)


def test_replace_callable_named_groups(any_string_dtype):
    """Check that a callable ``repl`` can read named groups from the match."""
    # test regex named groups
    ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype)
    # ``middle`` is required by the callable below; ``first``/``last`` are
    # restored names (stripped in the reviewed text) -- TODO confirm
    pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)"
    repl = lambda m: m.group("middle").swapcase()
    result = ser.str.replace(pat, repl, regex=True)
    expected = Series(["bAR", np.nan], dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)


def test_replace_compiled_regex(any_string_dtype):
    """Check ``str.replace`` with a compiled pattern, with and without ``n``."""
    # GH 15446
    ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)

    # test with compiled regex
    pat = re.compile(r"BAD_*")
    result = ser.str.replace(pat, "", regex=True)
    expected = Series(["foobar", np.nan], dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)

    # n=1 replaces only the first occurrence
    result = ser.str.replace(pat, "", n=1, regex=True)
    expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)


def test_replace_compiled_regex_mixed_object():
    """Check compiled-pattern replace on an object Series with non-strings."""
    pat = re.compile(r"BAD_*")
    ser = Series(
        ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0]
    )
    result = Series(ser).str.replace(pat, "", regex=True)
    expected = Series(
        ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan], dtype=object
    )
    tm.assert_series_equal(result, expected)


def test_replace_compiled_regex_unicode(any_string_dtype):
    """Check compiled-pattern replace next to a non-ASCII word character."""
    ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
    expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
    pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE)
    result = ser.str.replace(pat, ", ", regex=True)
    tm.assert_series_equal(result, expected)


def test_replace_compiled_regex_raises(any_string_dtype):
    """Check that ``case``/``flags`` with a compiled pattern raise ``ValueError``."""
    # case and flags cannot be combined with a compiled pattern:
    # str.replace raises ValueError for each of the calls below
    ser = Series(["fooBAD__barBAD__bad", np.nan], dtype=any_string_dtype)
    pat = re.compile(r"BAD_*")

    msg = "case and flags cannot be set when pat is a compiled regex"

    with pytest.raises(ValueError, match=msg):
        ser.str.replace(pat, "", flags=re.IGNORECASE, regex=True)

    with pytest.raises(ValueError, match=msg):
        ser.str.replace(pat, "", case=False, regex=True)

    with pytest.raises(ValueError, match=msg):
        ser.str.replace(pat, "", case=True, regex=True)


def test_replace_compiled_regex_callable(any_string_dtype):
    """Check a callable ``repl`` combined with a compiled pattern and ``n``."""
    # test with callable
    ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
    repl = lambda m: m.group(0).swapcase()
    pat = re.compile("[a-z][A-Z]{2}")
    result = ser.str.replace(pat, repl, n=2, regex=True)
    expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("regex,expected_val", [(True, "bao"), (False, "foo")])
def test_replace_literal(regex, expected_val, any_string_dtype):
    """Check that ``regex=`` decides whether ``"f."`` is a pattern or a literal."""
    # GH16808 literal replace (regex=False vs regex=True)
    ser = Series(["f.o", "foo", np.nan], dtype=any_string_dtype)
    expected = Series(["bao", expected_val, np.nan], dtype=any_string_dtype)
    result = ser.str.replace("f.", "ba", regex=regex)
    tm.assert_series_equal(result, expected)


def test_replace_literal_callable_raises(any_string_dtype):
    """Check that a callable ``repl`` is rejected when ``regex=False``."""
    ser = Series([], dtype=any_string_dtype)
    repl = lambda m: m.group(0).swapcase()

    msg = "Cannot use a callable replacement when regex=False"
    with pytest.raises(ValueError, match=msg):
        ser.str.replace("abc", repl, regex=False)


def test_replace_literal_compiled_raises(any_string_dtype):
    """Check that a compiled pattern is rejected when ``regex=False``."""
    ser = Series([], dtype=any_string_dtype)
    pat = re.compile("[a-z][A-Z]{2}")

    msg = "Cannot use a compiled regex as replacement pattern with regex=False"
    with pytest.raises(ValueError, match=msg):
        ser.str.replace(pat, "", regex=False)


def test_replace_moar(any_string_dtype):
    """Check ``str.replace`` for literal, case-insensitive and regex patterns."""
    # PR #1179
    ser = Series(
        ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"],
        dtype=any_string_dtype,
    )

    result = ser.str.replace("A", "YYY")
    expected = Series(
        ["YYY", "B", "C", "YYYaba", "Baca", "", np.nan, "CYYYBYYY", "dog", "cat"],
        dtype=any_string_dtype,
    )
    tm.assert_series_equal(result, expected)

    result = ser.str.replace("A", "YYY", case=False)
    expected = Series(
        [
            "YYY",
            "B",
            "C",
            "YYYYYYbYYY",
            "BYYYcYYY",
            "",
            np.nan,
            "CYYYBYYY",
            "dog",
            "cYYYt",
        ],
        dtype=any_string_dtype,
    )
    tm.assert_series_equal(result, expected)

    result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True)
    expected = Series(
        [
            "A",
            "B",
            "C",
            "XX-XX ba",
            "XX-XX ca",
            "",
            np.nan,
            "XX-XX BA",
            "XX-XX ",
            "XX-XX t",
        ],
        dtype=any_string_dtype,
    )
    tm.assert_series_equal(result, expected)


def test_replace_not_case_sensitive_not_regex(any_string_dtype):
    """Check ``case=False`` together with ``regex=False``.

    The second call shows that ``"."`` in the pattern is matched literally.
    """
    # https://github.com/pandas-dev/pandas/issues/41602
    ser = Series(["A.", "a.", "Ab", "ab", np.nan], dtype=any_string_dtype)

    result = ser.str.replace("a", "c", case=False, regex=False)
    expected = Series(["c.", "c.", "cb", "cb", np.nan], dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)

    result = ser.str.replace("a.", "c.", case=False, regex=False)
    expected = Series(["c.", "c.", "Ab", "ab", np.nan], dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)


def test_replace_regex(any_string_dtype):
    """Check an anchored single-character regex replacement."""
    # https://github.com/pandas-dev/pandas/pull/24809
    s = Series(["a", "b", "ac", np.nan, ""], dtype=any_string_dtype)
    result = s.str.replace("^.$", "a", regex=True)
    expected = Series(["a", "a", "ac", np.nan, ""], dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("regex", [True, False])
def test_replace_regex_single_character(regex, any_string_dtype):
    """Check that a lone ``"."`` is a wildcard only when ``regex=True``."""
    # https://github.com/pandas-dev/pandas/pull/24809, enforced in 2.0
    # GH 24804
    s = Series(["a.b", ".", "b", np.nan, ""], dtype=any_string_dtype)

    result = s.str.replace(".", "a", regex=regex)
    if regex:
        expected = Series(["aaa", "a", "a", np.nan, ""], dtype=any_string_dtype)
    else:
        expected = Series(["aab", "a", "b", np.nan, ""], dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)


# NOTE(review): in the reviewed text a
# ``@pytest.mark.parametrize("pat, expected_data", [...])`` decorator started
# here and was cut off inside its fourth pattern; the test it decorates is not
# visible.  The entries that were readable are recorded below so that nothing is
# lost -- restore the full definition from version control:
#     (r"a(?=b)", ["aa", "xb", "ba", "bb"])
#     (r"(?<=a)b", ["aa", "ax", "ba", "bb"])
#     (r"a(?!b)", ["xx", "ab", "bx", "bb"])