-
Notifications
You must be signed in to change notification settings - Fork 53
Description
Hey,
I'm new to coding and I'm trying AutoClean on a dataset, but I keep getting this error: TypeError: cannot safely cast non-equivalent float64 to int64.
According to ChatGPT this error typically occurs when you try to convert a floating-point number to an integer using the "int()" function or a similar method, but the float number is not a whole number, which causes a loss of precision.
But it must be possible to use floats as well, right?
So I'm curious why I might get this error. My code is provided below:
Thanks a lot for any help!
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
df = pd.read_csv('/Users/guyjansen/Desktop/Python/Housing Prices Data Science Project/train.csv')
from AutoClean.autoclean import AutoClean
pipeline = AutoClean(df)
pipeline.output
This raises the error:
TypeError Traceback (most recent call last)
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/arrays/integer.py in safe_cast(values, dtype, copy)
119 try:
--> 120 return values.astype(dtype, casting="safe", copy=copy)
121 except TypeError as err:
TypeError: Cannot cast array data from dtype('float64') to dtype('int64') according to the rule 'safe'
The above exception was the direct cause of the following exception:
TypeError Traceback (most recent call last)
/var/folders/wc/2vn5bk3x4hq0b0_hdn9tjzkm0000gn/T/ipykernel_45950/1120075175.py in <module>
1 from AutoClean.autoclean import AutoClean
----> 2 pipeline = AutoClean(df)
3 pipeline.output
~/opt/anaconda3/lib/python3.9/site-packages/AutoClean/autoclean.py in __init__(self, input_data, mode, duplicates, missing_num, missing_categ, encode_categ, extract_datetime, outliers, outlier_param, logfile, verbose)
80
81 # initialize our class and start the autoclean process
---> 82 self.output = self._clean_data(output_data, input_data)
83
84 end = timer()
~/opt/anaconda3/lib/python3.9/site-packages/AutoClean/autoclean.py in _clean_data(self, df, input_data)
141 df = Duplicates.handle(self, df)
142 df = MissingValues.handle(self, df)
--> 143 df = Outliers.handle(self, df)
144 df = Adjust.convert_datetime(self, df)
145 df = EncodeCateg.handle(self, df)
~/opt/anaconda3/lib/python3.9/site-packages/AutoClean/modules.py in handle(self, df)
272
273 if self.outliers in ['auto', 'winz']:
--> 274 df = Outliers._winsorization(self, df)
275 elif self.outliers == 'delete':
276 df = Outliers._delete(self, df)
~/opt/anaconda3/lib/python3.9/site-packages/AutoClean/modules.py in _winsorization(self, df)
300 else:
301 if (df[feature].fillna(-9999) % 1 == 0).all():
--> 302 df.loc[row_index, feature] = upper_bound
303 df[feature] = df[feature].astype(int)
304 else:
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py in __setitem__(self, key, value)
714
715 iloc = self if self.name == "iloc" else self.obj.iloc
--> 716 iloc._setitem_with_indexer(indexer, value, self.name)
717
718 def _validate_key(self, key, axis: int):
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py in _setitem_with_indexer(self, indexer, value, name)
1689 if take_split_path:
1690 # We have to operate column-wise
-> 1691 self._setitem_with_indexer_split_path(indexer, value, name)
1692 else:
1693 self._setitem_single_block(indexer, value, name)
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py in _setitem_with_indexer_split_path(self, indexer, value, name)
1782 # scalar value
1783 for loc in ilocs:
-> 1784 self._setitem_single_column(loc, value, pi)
1785
1786 def _setitem_with_indexer_2d_value(self, indexer, value):
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py in _setitem_single_column(self, loc, value, plane_indexer)
1888
1889 orig_values = ser._values
-> 1890 ser._mgr = ser._mgr.setitem((pi,), value)
1891
1892 if ser._values is orig_values:
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/internals/managers.py in setitem(self, indexer, value)
335 For SingleBlockManager, this backs s[indexer] = value
336 """
--> 337 return self.apply("setitem", indexer=indexer, value=value)
338
339 def putmask(self, mask, new, align: bool = True):
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, ignore_failures, **kwargs)
302 applied = b.apply(f, **kwargs)
303 else:
--> 304 applied = getattr(b, f)(**kwargs)
305 except (TypeError, NotImplementedError):
306 if not ignore_failures:
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/internals/blocks.py in setitem(self, indexer, value)
1620
1621 check_setitem_lengths(indexer, value, self.values)
-> 1622 self.values[indexer] = value
1623 return self
1624
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/arrays/masked.py in __setitem__(self, key, value)
222 if _is_scalar:
223 value = [value]
--> 224 value, mask = self._coerce_to_array(value)
225
226 if _is_scalar:
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/arrays/integer.py in _coerce_to_array(self, value)
334
335 def _coerce_to_array(self, value) -> tuple[np.ndarray, np.ndarray]:
--> 336 return coerce_to_array(value, dtype=self.dtype)
337
338 @overload
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/arrays/integer.py in coerce_to_array(values, dtype, mask, copy)
228 values = values.astype(dtype, copy=copy)
229 else:
--> 230 values = safe_cast(values, dtype, copy=False)
231
232 return values, mask
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/arrays/integer.py in safe_cast(values, dtype, copy)
124 return casted
125
--> 126 raise TypeError(
127 f"cannot safely cast non-equivalent {values.dtype} to {np.dtype(dtype)}"
128 ) from err
TypeError: cannot safely cast non-equivalent float64 to int64