Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support axis=1 for DataFrame.dropna(). #1689

Merged
Merged 3 commits on Aug 3, 2020.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 87 additions & 14 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4554,6 +4554,14 @@ def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False):
name toy born
1 Batman Batmobile 1940-04-25

Drop the columns where at least one element is missing.

>>> df.dropna(axis='columns')
name
0 Alfred
1 Batman
2 Catwoman

Drop the rows where all elements are missing.

>>> df.dropna(how='all')
Expand Down Expand Up @@ -4584,14 +4592,25 @@ def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False):
"""
axis = validate_axis(axis)
inplace = validate_bool_kwarg(inplace, "inplace")

if thresh is None:
if how is None:
raise TypeError("must specify how or thresh")
elif how not in ("any", "all"):
raise ValueError("invalid how option: {h}".format(h=how))

if subset is not None:
if isinstance(subset, str):
labels = [(subset,)]
elif isinstance(subset, tuple):
labels = [subset]
else:
labels = [sub if isinstance(sub, tuple) else (sub,) for sub in subset]
else:
labels = None

if axis == 0:
if subset is not None:
if isinstance(subset, str):
labels = [(subset,)]
elif isinstance(subset, tuple):
labels = [subset]
else:
labels = [sub if isinstance(sub, tuple) else (sub,) for sub in subset]
if labels is not None:
invalids = [label for label in labels if label not in self._internal.column_labels]
if len(invalids) > 0:
raise KeyError(invalids)
Expand All @@ -4612,20 +4631,74 @@ def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False):
pred = cnt == F.lit(len(labels))
elif how == "all":
pred = cnt > F.lit(0)
else:
if how is not None:
raise ValueError("invalid how option: {h}".format(h=how))
else:
raise TypeError("must specify how or thresh")

internal = self._internal.with_filter(pred)
if inplace:
self._update_internal_frame(internal)
else:
return DataFrame(internal)

else:
raise NotImplementedError("dropna currently only works for axis=0 or axis='index'")
assert axis == 1

internal = self._internal.resolved_copy

if labels is not None:
if any(len(lbl) != len(internal.index_map) for lbl in labels):
raise ValueError(
"The length of each subset must be the same as the index size."
)

cond = reduce(
lambda x, y: x | y,
[
reduce(
lambda x, y: x & y,
[
scol == F.lit(l)
for l, scol in zip(lbl, internal.index_spark_columns)
],
)
for lbl in labels
],
)

internal = internal.with_filter(cond)

null_counts = []
for label in internal.column_labels:
scol = internal.spark_column_for(label)
if isinstance(internal.spark_type_for(label), (FloatType, DoubleType)):
cond = scol.isNull() | F.isnan(scol)
else:
cond = scol.isNull()
null_counts.append(
F.sum(F.when(~cond, 1).otherwise(0)).alias(name_like_string(label))
)

counts = internal.spark_frame.select(null_counts + [F.count("*")]).head()

if thresh is not None:
column_labels = [
label
for label, cnt in zip(internal.column_labels, counts)
if (cnt or 0) >= int(thresh)
]
elif how == "any":
column_labels = [
label
for label, cnt in zip(internal.column_labels, counts)
if (cnt or 0) == counts[-1]
]
elif how == "all":
column_labels = [
label for label, cnt in zip(internal.column_labels, counts) if (cnt or 0) > 0
]

kdf = self[column_labels]
if inplace:
self._update_internal_frame(kdf._internal)
else:
return kdf

# TODO: add 'limit' when value parameter exists
def fillna(self, value=None, method=None, axis=None, inplace=False, limit=None):
Expand Down
130 changes: 86 additions & 44 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -634,45 +634,89 @@ def test_drop(self):
self.assertRaises(KeyError, lambda: kdf.drop(columns="c"))
self.assertRaises(KeyError, lambda: kdf.drop(columns=("a", "z")))

def test_dropna(self):
pdf = pd.DataFrame(
{
"x": [np.nan, 2, 3, 4, np.nan, 6],
"y": [1, 2, np.nan, 4, np.nan, np.nan],
"z": [1, 2, 3, 4, np.nan, np.nan],
},
index=np.random.rand(6),
)
def _test_dropna(self, pdf, axis):
kdf = ks.from_pandas(pdf)

self.assert_eq(kdf.dropna(), pdf.dropna())
self.assert_eq(kdf.dropna(how="all"), pdf.dropna(how="all"))
self.assert_eq(kdf.dropna(subset=["x"]), pdf.dropna(subset=["x"]))
self.assert_eq(kdf.dropna(subset="x"), pdf.dropna(subset=["x"]))
self.assert_eq(kdf.dropna(subset=["y", "z"]), pdf.dropna(subset=["y", "z"]))
self.assert_eq(kdf.dropna(axis=axis), pdf.dropna(axis=axis))
self.assert_eq(kdf.dropna(axis=axis, how="all"), pdf.dropna(axis=axis, how="all"))
self.assert_eq(kdf.dropna(axis=axis, subset=["x"]), pdf.dropna(axis=axis, subset=["x"]))
self.assert_eq(kdf.dropna(axis=axis, subset="x"), pdf.dropna(axis=axis, subset=["x"]))
self.assert_eq(
kdf.dropna(axis=axis, subset=["y", "z"]), pdf.dropna(axis=axis, subset=["y", "z"])
)
self.assert_eq(
kdf.dropna(subset=["y", "z"], how="all"), pdf.dropna(subset=["y", "z"], how="all")
kdf.dropna(axis=axis, subset=["y", "z"], how="all"),
pdf.dropna(axis=axis, subset=["y", "z"], how="all"),
)

self.assert_eq(kdf.dropna(thresh=2), pdf.dropna(thresh=2))
self.assert_eq(kdf.dropna(axis=axis, thresh=2), pdf.dropna(axis=axis, thresh=2))
self.assert_eq(
kdf.dropna(thresh=1, subset=["y", "z"]), pdf.dropna(thresh=1, subset=["y", "z"])
kdf.dropna(axis=axis, thresh=1, subset=["y", "z"]),
pdf.dropna(axis=axis, thresh=1, subset=["y", "z"]),
)

pdf2 = pdf.copy()
kdf2 = kdf.copy()
pser = pdf2.x
kser = kdf2.x
pser = pdf2[pdf2.columns[0]]
kser = kdf2[kdf2.columns[0]]
pdf2.dropna(inplace=True)
kdf2.dropna(inplace=True)
self.assert_eq(kdf2, pdf2)
self.assert_eq(kser, pser, almost=True)

msg = "dropna currently only works for axis=0 or axis='index'"
with self.assertRaisesRegex(NotImplementedError, msg):
kdf.dropna(axis=1)
with self.assertRaisesRegex(NotImplementedError, msg):
kdf.dropna(axis="columns")
# multi-index
columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
if axis == 0:
pdf.columns = columns
else:
pdf.index = columns
kdf = ks.from_pandas(pdf)

self.assert_eq(kdf.dropna(axis=axis), pdf.dropna(axis=axis))
self.assert_eq(kdf.dropna(axis=axis, how="all"), pdf.dropna(axis=axis, how="all"))
self.assert_eq(
kdf.dropna(axis=axis, subset=[("a", "x")]), pdf.dropna(axis=axis, subset=[("a", "x")])
)
self.assert_eq(
kdf.dropna(axis=axis, subset=("a", "x")), pdf.dropna(axis=axis, subset=[("a", "x")])
)
self.assert_eq(
kdf.dropna(axis=axis, subset=[("a", "y"), ("b", "z")]),
pdf.dropna(axis=axis, subset=[("a", "y"), ("b", "z")]),
)
self.assert_eq(
kdf.dropna(axis=axis, subset=[("a", "y"), ("b", "z")], how="all"),
pdf.dropna(axis=axis, subset=[("a", "y"), ("b", "z")], how="all"),
)

self.assert_eq(kdf.dropna(axis=axis, thresh=2), pdf.dropna(axis=axis, thresh=2))
self.assert_eq(
kdf.dropna(axis=axis, thresh=1, subset=[("a", "y"), ("b", "z")]),
pdf.dropna(axis=axis, thresh=1, subset=[("a", "y"), ("b", "z")]),
)

def test_dropna_axis_index(self):
pdf = pd.DataFrame(
{
"x": [np.nan, 2, 3, 4, np.nan, 6],
"y": [1, 2, np.nan, 4, np.nan, np.nan],
"z": [1, 2, 3, 4, np.nan, np.nan],
},
index=np.random.rand(6),
)
kdf = ks.from_pandas(pdf)

self._test_dropna(pdf, axis=0)

# empty
pdf = pd.DataFrame(index=np.random.rand(6))
kdf = ks.from_pandas(pdf)

self.assert_eq(kdf.dropna(), pdf.dropna())
self.assert_eq(kdf.dropna(how="all"), pdf.dropna(how="all"))
self.assert_eq(kdf.dropna(thresh=0), pdf.dropna(thresh=0))
self.assert_eq(kdf.dropna(thresh=1), pdf.dropna(thresh=1))

with self.assertRaisesRegex(ValueError, "No axis named foo"):
kdf.dropna(axis="foo")

Expand All @@ -682,28 +726,26 @@ def test_dropna(self):
with self.assertRaisesRegex(TypeError, "must specify how or thresh"):
kdf.dropna(how=None)

# multi-index columns
columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
pdf.columns = columns
kdf.columns = columns
def test_dropna_axis_column(self):
pdf = pd.DataFrame(
{
"x": [np.nan, 2, 3, 4, np.nan, 6],
"y": [1, 2, np.nan, 4, np.nan, np.nan],
"z": [1, 2, 3, 4, np.nan, np.nan],
},
index=[str(r) for r in np.random.rand(6)],
).T

self.assert_eq(kdf.dropna(), pdf.dropna())
self.assert_eq(kdf.dropna(how="all"), pdf.dropna(how="all"))
self.assert_eq(kdf.dropna(subset=[("a", "x")]), pdf.dropna(subset=[("a", "x")]))
self.assert_eq(kdf.dropna(subset=("a", "x")), pdf.dropna(subset=[("a", "x")]))
self.assert_eq(
kdf.dropna(subset=[("a", "y"), ("b", "z")]), pdf.dropna(subset=[("a", "y"), ("b", "z")])
)
self.assert_eq(
kdf.dropna(subset=[("a", "y"), ("b", "z")], how="all"),
pdf.dropna(subset=[("a", "y"), ("b", "z")], how="all"),
)
self._test_dropna(pdf, axis=1)

self.assert_eq(kdf.dropna(thresh=2), pdf.dropna(thresh=2))
self.assert_eq(
kdf.dropna(thresh=1, subset=[("a", "y"), ("b", "z")]),
pdf.dropna(thresh=1, subset=[("a", "y"), ("b", "z")]),
)
# empty
pdf = pd.DataFrame({"x": [], "y": [], "z": []})
kdf = ks.from_pandas(pdf)

self.assert_eq(kdf.dropna(axis=1), pdf.dropna(axis=1))
self.assert_eq(kdf.dropna(axis=1, how="all"), pdf.dropna(axis=1, how="all"))
self.assert_eq(kdf.dropna(axis=1, thresh=0), pdf.dropna(axis=1, thresh=0))
self.assert_eq(kdf.dropna(axis=1, thresh=1), pdf.dropna(axis=1, thresh=1))

def test_dtype(self):
pdf = pd.DataFrame(
Expand Down