|
6 | 6 |
|
7 | 7 | from ._base import _BaseImputer |
8 | 8 | from ..utils.validation import FLOAT_DTYPES |
9 | | -from ..metrics import pairwise_distances |
| 9 | +from ..metrics import pairwise_distances_chunked |
10 | 10 | from ..metrics.pairwise import _NAN_METRICS |
11 | 11 | from ..neighbors._base import _get_weights |
12 | 12 | from ..neighbors._base import _check_weights |
@@ -217,71 +217,81 @@ def transform(self, X): |
217 | 217 |
|
218 | 218 | mask = _get_mask(X, self.missing_values) |
219 | 219 | mask_fit_X = self._mask_fit_X |
| 220 | + valid_mask = ~np.all(mask_fit_X, axis=0) |
220 | 221 |
|
221 | | - # Removes columns where the training data is all nan |
222 | 222 | if not np.any(mask): |
223 | | - valid_mask = ~np.all(mask_fit_X, axis=0) |
| 223 | + # No missing values in X |
| 224 | + # Remove columns where the training data is all nan |
224 | 225 | return X[:, valid_mask] |
225 | 226 |
|
226 | 227 | row_missing_idx = np.flatnonzero(mask.any(axis=1)) |
227 | 228 |
|
228 | | - # Pairwise distances between receivers and fitted samples |
229 | | - dist = pairwise_distances(X[row_missing_idx, :], self._fit_X, |
230 | | - metric=self.metric, |
231 | | - missing_values=self.missing_values, |
232 | | - force_all_finite=force_all_finite) |
| 229 | + non_missing_fix_X = np.logical_not(mask_fit_X) |
233 | 230 |
|
234 | 231 | # Maps indices in X to row indices in the dist matrix |
235 | 232 | dist_idx_map = np.zeros(X.shape[0], dtype=np.int) |
236 | 233 | dist_idx_map[row_missing_idx] = np.arange(row_missing_idx.shape[0]) |
237 | 234 |
|
238 | | - non_missing_fix_X = np.logical_not(mask_fit_X) |
239 | | - |
240 | | - # Find and impute missing |
241 | | - valid_idx = [] |
242 | | - for col in range(X.shape[1]): |
243 | | - |
244 | | - potential_donors_idx = np.flatnonzero(non_missing_fix_X[:, col]) |
245 | | - |
246 | | - # column was all missing during training |
247 | | - if len(potential_donors_idx) == 0: |
248 | | - continue |
249 | | - |
250 | | - # column has no missing values |
251 | | - if not np.any(mask[:, col]): |
252 | | - valid_idx.append(col) |
253 | | - continue |
| 235 | + def process_chunk(dist_chunk, start): |
| 236 | + row_missing_chunk = row_missing_idx[start:start + len(dist_chunk)] |
254 | 237 |
|
255 | | - valid_idx.append(col) |
256 | | - |
257 | | - receivers_idx = np.flatnonzero(mask[:, col]) |
258 | | - |
259 | | - # distances for samples that needed imputation for column |
260 | | - dist_subset = (dist[dist_idx_map[receivers_idx]] |
261 | | - [:, potential_donors_idx]) |
| 238 | + # Find and impute missing by column |
| 239 | + for col in range(X.shape[1]): |
| 240 | + if not valid_mask[col]: |
| 241 | + # column was all missing during training |
| 242 | + continue |
262 | 243 |
|
263 | | - # receivers with all nan distances impute with mean |
264 | | - all_nan_dist_mask = np.isnan(dist_subset).all(axis=1) |
265 | | - all_nan_receivers_idx = receivers_idx[all_nan_dist_mask] |
| 244 | + col_mask = mask[row_missing_chunk, col] |
| 245 | + if not np.any(col_mask): |
| 246 | + # column has no missing values |
| 247 | + continue |
266 | 248 |
|
267 | | - if all_nan_receivers_idx.size: |
268 | | - col_mean = np.ma.array(self._fit_X[:, col], |
269 | | - mask=mask_fit_X[:, col]).mean() |
270 | | - X[all_nan_receivers_idx, col] = col_mean |
| 249 | + potential_donors_idx, = np.nonzero(non_missing_fix_X[:, col]) |
271 | 250 |
|
272 | | - if len(all_nan_receivers_idx) == len(receivers_idx): |
273 | | - # all receivers imputed with mean |
274 | | - continue |
| 251 | + # receivers_idx are indices in X |
| 252 | + receivers_idx = row_missing_chunk[np.flatnonzero(col_mask)] |
275 | 253 |
|
276 | | - # receivers with at least one defined distance |
277 | | - receivers_idx = receivers_idx[~all_nan_dist_mask] |
278 | | - dist_subset = (dist[dist_idx_map[receivers_idx]] |
| 254 | + # distances for samples that needed imputation for column |
| 255 | + dist_subset = (dist_chunk[dist_idx_map[receivers_idx] - start] |
279 | 256 | [:, potential_donors_idx]) |
280 | 257 |
|
281 | | - n_neighbors = min(self.n_neighbors, len(potential_donors_idx)) |
282 | | - value = self._calc_impute(dist_subset, n_neighbors, |
283 | | - self._fit_X[potential_donors_idx, col], |
284 | | - mask_fit_X[potential_donors_idx, col]) |
285 | | - X[receivers_idx, col] = value |
286 | | - |
287 | | - return super()._concatenate_indicator(X[:, valid_idx], X_indicator) |
| 258 | + # receivers with all nan distances impute with mean |
| 259 | + all_nan_dist_mask = np.isnan(dist_subset).all(axis=1) |
| 260 | + all_nan_receivers_idx = receivers_idx[all_nan_dist_mask] |
| 261 | + |
| 262 | + if all_nan_receivers_idx.size: |
| 263 | + col_mean = np.ma.array(self._fit_X[:, col], |
| 264 | + mask=mask_fit_X[:, col]).mean() |
| 265 | + X[all_nan_receivers_idx, col] = col_mean |
| 266 | + |
| 267 | + if len(all_nan_receivers_idx) == len(receivers_idx): |
| 268 | + # all receivers imputed with mean |
| 269 | + continue |
| 270 | + |
| 271 | + # receivers with at least one defined distance |
| 272 | + receivers_idx = receivers_idx[~all_nan_dist_mask] |
| 273 | + dist_subset = (dist_chunk[dist_idx_map[receivers_idx] |
| 274 | + - start] |
| 275 | + [:, potential_donors_idx]) |
| 276 | + |
| 277 | + n_neighbors = min(self.n_neighbors, len(potential_donors_idx)) |
| 278 | + value = self._calc_impute( |
| 279 | + dist_subset, |
| 280 | + n_neighbors, |
| 281 | + self._fit_X[potential_donors_idx, col], |
| 282 | + mask_fit_X[potential_donors_idx, col]) |
| 283 | + X[receivers_idx, col] = value |
| 284 | + |
| 285 | + # process in fixed-memory chunks |
| 286 | + gen = pairwise_distances_chunked( |
| 287 | + X[row_missing_idx, :], |
| 288 | + self._fit_X, |
| 289 | + metric=self.metric, |
| 290 | + missing_values=self.missing_values, |
| 291 | + force_all_finite=force_all_finite, |
| 292 | + reduce_func=process_chunk) |
| 293 | + for chunk in gen: |
| 294 | + # process_chunk modifies X in place. No return value. |
| 295 | + pass |
| 296 | + |
| 297 | + return super()._concatenate_indicator(X[:, valid_mask], X_indicator) |
0 commit comments