1import itertools 2from typing import List, Optional, Union 3 4import numpy as np 5 6import pandas._libs.algos as libalgos 7import pandas._libs.reshape as libreshape 8from pandas._libs.sparse import IntIndex 9from pandas.util._decorators import cache_readonly 10 11from pandas.core.dtypes.cast import maybe_promote 12from pandas.core.dtypes.common import ( 13 ensure_platform_int, 14 is_bool_dtype, 15 is_extension_array_dtype, 16 is_integer, 17 is_integer_dtype, 18 is_list_like, 19 is_object_dtype, 20 needs_i8_conversion, 21) 22from pandas.core.dtypes.missing import notna 23 24import pandas.core.algorithms as algos 25from pandas.core.arrays import SparseArray 26from pandas.core.arrays.categorical import factorize_from_iterable 27from pandas.core.frame import DataFrame 28from pandas.core.indexes.api import Index, MultiIndex 29from pandas.core.series import Series 30from pandas.core.sorting import ( 31 compress_group_index, 32 decons_obs_group_ids, 33 get_compressed_ids, 34 get_group_index, 35) 36 37 38class _Unstacker: 39 """ 40 Helper class to unstack data / pivot with multi-level index 41 42 Parameters 43 ---------- 44 index : MultiIndex 45 level : int or str, default last level 46 Level to "unstack". Accepts a name for the level. 47 fill_value : scalar, optional 48 Default value to fill in missing values if subgroups do not have the 49 same set of labels. By default, missing values will be replaced with 50 the default fill value for that data type, NaN for float, NaT for 51 datetimelike, etc. For integer types, by default data will converted to 52 float and missing values will be set to NaN. 53 constructor : object 54 Pandas ``DataFrame`` or subclass used to create unstacked 55 response. If None, DataFrame will be used. 56 57 Examples 58 -------- 59 >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), 60 ... ('two', 'a'), ('two', 'b')]) 61 >>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index) 62 >>> s 63 one a 1 64 b 2 65 two a 3 66 b 4 67 dtype: int64 68 69 >>> s.unstack(level=-1) 70 a b 71 one 1 2 72 two 3 4 73 74 >>> s.unstack(level=0) 75 one two 76 a 1 3 77 b 2 4 78 79 Returns 80 ------- 81 unstacked : DataFrame 82 """ 83 84 def __init__(self, index: MultiIndex, level=-1, constructor=None): 85 86 if constructor is None: 87 constructor = DataFrame 88 self.constructor = constructor 89 90 self.index = index.remove_unused_levels() 91 92 self.level = self.index._get_level_number(level) 93 94 # when index includes `nan`, need to lift levels/strides by 1 95 self.lift = 1 if -1 in self.index.codes[self.level] else 0 96 97 # Note: the "pop" below alters these in-place. 98 self.new_index_levels = list(self.index.levels) 99 self.new_index_names = list(self.index.names) 100 101 self.removed_name = self.new_index_names.pop(self.level) 102 self.removed_level = self.new_index_levels.pop(self.level) 103 self.removed_level_full = index.levels[self.level] 104 105 # Bug fix GH 20601 106 # If the data frame is too big, the number of unique index combination 107 # will cause int32 overflow on windows environments. 108 # We want to check and raise an error before this happens 109 num_rows = np.max([index_level.size for index_level in self.new_index_levels]) 110 num_columns = self.removed_level.size 111 112 # GH20601: This forces an overflow if the number of cells is too high. 113 num_cells = np.multiply(num_rows, num_columns, dtype=np.int32) 114 115 if num_rows > 0 and num_columns > 0 and num_cells <= 0: 116 raise ValueError("Unstacked DataFrame is too big, causing int32 overflow") 117 118 self._make_selectors() 119 120 @cache_readonly 121 def _indexer_and_to_sort(self): 122 v = self.level 123 124 codes = list(self.index.codes) 125 levs = list(self.index.levels) 126 to_sort = codes[:v] + codes[v + 1 :] + [codes[v]] 127 sizes = [len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]] 128 129 comp_index, obs_ids = get_compressed_ids(to_sort, sizes) 130 ngroups = len(obs_ids) 131 132 indexer = libalgos.groupsort_indexer(comp_index, ngroups)[0] 133 indexer = ensure_platform_int(indexer) 134 135 return indexer, to_sort 136 137 @cache_readonly 138 def sorted_labels(self): 139 indexer, to_sort = self._indexer_and_to_sort 140 return [line.take(indexer) for line in to_sort] 141 142 def _make_sorted_values(self, values: np.ndarray) -> np.ndarray: 143 indexer, _ = self._indexer_and_to_sort 144 145 sorted_values = algos.take_nd(values, indexer, axis=0) 146 return sorted_values 147 148 def _make_selectors(self): 149 new_levels = self.new_index_levels 150 151 # make the mask 152 remaining_labels = self.sorted_labels[:-1] 153 level_sizes = [len(x) for x in new_levels] 154 155 comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes) 156 ngroups = len(obs_ids) 157 158 comp_index = ensure_platform_int(comp_index) 159 stride = self.index.levshape[self.level] + self.lift 160 self.full_shape = ngroups, stride 161 162 selector = self.sorted_labels[-1] + stride * comp_index + self.lift 163 mask = np.zeros(np.prod(self.full_shape), dtype=bool) 164 mask.put(selector, True) 165 166 if mask.sum() < len(self.index): 167 raise ValueError("Index contains duplicate entries, cannot reshape") 168 169 self.group_index = comp_index 170 self.mask = mask 171 self.unique_groups = obs_ids 172 self.compressor = comp_index.searchsorted(np.arange(ngroups)) 173 174 def get_result(self, values, value_columns, fill_value): 175 176 if values.ndim == 1: 177 values = values[:, np.newaxis] 178 179 if value_columns is None and values.shape[1] != 1: # pragma: no cover 180 raise ValueError("must pass column labels for multi-column data") 181 182 values, _ = self.get_new_values(values, fill_value) 183 columns = self.get_new_columns(value_columns) 184 index = self.new_index 185 186 return self.constructor(values, index=index, columns=columns) 187 188 def get_new_values(self, values, fill_value=None): 189 190 if values.ndim == 1: 191 values = values[:, np.newaxis] 192 193 sorted_values = self._make_sorted_values(values) 194 195 # place the values 196 length, width = self.full_shape 197 stride = values.shape[1] 198 result_width = width * stride 199 result_shape = (length, result_width) 200 mask = self.mask 201 mask_all = mask.all() 202 203 # we can simply reshape if we don't have a mask 204 if mask_all and len(values): 205 # TODO: Under what circumstances can we rely on sorted_values 206 # matching values? When that holds, we can slice instead 207 # of take (in particular for EAs) 208 new_values = ( 209 sorted_values.reshape(length, width, stride) 210 .swapaxes(1, 2) 211 .reshape(result_shape) 212 ) 213 new_mask = np.ones(result_shape, dtype=bool) 214 return new_values, new_mask 215 216 # if our mask is all True, then we can use our existing dtype 217 if mask_all: 218 dtype = values.dtype 219 new_values = np.empty(result_shape, dtype=dtype) 220 else: 221 dtype, fill_value = maybe_promote(values.dtype, fill_value) 222 new_values = np.empty(result_shape, dtype=dtype) 223 new_values.fill(fill_value) 224 225 new_mask = np.zeros(result_shape, dtype=bool) 226 227 name = np.dtype(dtype).name 228 229 # we need to convert to a basic dtype 230 # and possibly coerce an input to our output dtype 231 # e.g. ints -> floats 232 if needs_i8_conversion(values.dtype): 233 sorted_values = sorted_values.view("i8") 234 new_values = new_values.view("i8") 235 elif is_bool_dtype(values.dtype): 236 sorted_values = sorted_values.astype("object") 237 new_values = new_values.astype("object") 238 else: 239 sorted_values = sorted_values.astype(name, copy=False) 240 241 # fill in our values & mask 242 libreshape.unstack( 243 sorted_values, 244 mask.view("u1"), 245 stride, 246 length, 247 width, 248 new_values, 249 new_mask.view("u1"), 250 ) 251 252 # reconstruct dtype if needed 253 if needs_i8_conversion(values.dtype): 254 new_values = new_values.view(values.dtype) 255 256 return new_values, new_mask 257 258 def get_new_columns(self, value_columns): 259 if value_columns is None: 260 if self.lift == 0: 261 return self.removed_level._shallow_copy(name=self.removed_name) 262 263 lev = self.removed_level.insert(0, item=self.removed_level._na_value) 264 return lev.rename(self.removed_name) 265 266 stride = len(self.removed_level) + self.lift 267 width = len(value_columns) 268 propagator = np.repeat(np.arange(width), stride) 269 if isinstance(value_columns, MultiIndex): 270 new_levels = value_columns.levels + (self.removed_level_full,) 271 new_names = value_columns.names + (self.removed_name,) 272 273 new_codes = [lab.take(propagator) for lab in value_columns.codes] 274 else: 275 new_levels = [value_columns, self.removed_level_full] 276 new_names = [value_columns.name, self.removed_name] 277 new_codes = [propagator] 278 279 # The two indices differ only if the unstacked level had unused items: 280 if len(self.removed_level_full) != len(self.removed_level): 281 # In this case, we remap the new codes to the original level: 282 repeater = self.removed_level_full.get_indexer(self.removed_level) 283 if self.lift: 284 repeater = np.insert(repeater, 0, -1) 285 else: 286 # Otherwise, we just use each level item exactly once: 287 repeater = np.arange(stride) - self.lift 288 289 # The entire level is then just a repetition of the single chunk: 290 new_codes.append(np.tile(repeater, width)) 291 return MultiIndex( 292 levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False 293 ) 294 295 @cache_readonly 296 def new_index(self): 297 # Does not depend on values or value_columns 298 result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]] 299 300 # construct the new index 301 if len(self.new_index_levels) == 1: 302 level, level_codes = self.new_index_levels[0], result_codes[0] 303 if (level_codes == -1).any(): 304 level = level.insert(len(level), level._na_value) 305 return level.take(level_codes).rename(self.new_index_names[0]) 306 307 return MultiIndex( 308 levels=self.new_index_levels, 309 codes=result_codes, 310 names=self.new_index_names, 311 verify_integrity=False, 312 ) 313 314 315def _unstack_multiple(data, clocs, fill_value=None): 316 if len(clocs) == 0: 317 return data 318 319 # NOTE: This doesn't deal with hierarchical columns yet 320 321 index = data.index 322 323 # GH 19966 Make sure if MultiIndexed index has tuple name, they will be 324 # recognised as a whole 325 if clocs in index.names: 326 clocs = [clocs] 327 clocs = [index._get_level_number(i) for i in clocs] 328 329 rlocs = [i for i in range(index.nlevels) if i not in clocs] 330 331 clevels = [index.levels[i] for i in clocs] 332 ccodes = [index.codes[i] for i in clocs] 333 cnames = [index.names[i] for i in clocs] 334 rlevels = [index.levels[i] for i in rlocs] 335 rcodes = [index.codes[i] for i in rlocs] 336 rnames = [index.names[i] for i in rlocs] 337 338 shape = [len(x) for x in clevels] 339 group_index = get_group_index(ccodes, shape, sort=False, xnull=False) 340 341 comp_ids, obs_ids = compress_group_index(group_index, sort=False) 342 recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False) 343 344 if not rlocs: 345 # Everything is in clocs, so the dummy df has a regular index 346 dummy_index = Index(obs_ids, name="__placeholder__") 347 else: 348 dummy_index = MultiIndex( 349 levels=rlevels + [obs_ids], 350 codes=rcodes + [comp_ids], 351 names=rnames + ["__placeholder__"], 352 verify_integrity=False, 353 ) 354 355 if isinstance(data, Series): 356 dummy = data.copy() 357 dummy.index = dummy_index 358 359 unstacked = dummy.unstack("__placeholder__", fill_value=fill_value) 360 new_levels = clevels 361 new_names = cnames 362 new_codes = recons_codes 363 else: 364 if isinstance(data.columns, MultiIndex): 365 result = data 366 for i in range(len(clocs)): 367 val = clocs[i] 368 result = result.unstack(val, fill_value=fill_value) 369 clocs = [v if v < val else v - 1 for v in clocs] 370 371 return result 372 373 dummy = data.copy() 374 dummy.index = dummy_index 375 376 unstacked = dummy.unstack("__placeholder__", fill_value=fill_value) 377 if isinstance(unstacked, Series): 378 unstcols = unstacked.index 379 else: 380 unstcols = unstacked.columns 381 assert isinstance(unstcols, MultiIndex) # for mypy 382 new_levels = [unstcols.levels[0]] + clevels 383 new_names = [data.columns.name] + cnames 384 385 new_codes = [unstcols.codes[0]] 386 for rec in recons_codes: 387 new_codes.append(rec.take(unstcols.codes[-1])) 388 389 new_columns = MultiIndex( 390 levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False 391 ) 392 393 if isinstance(unstacked, Series): 394 unstacked.index = new_columns 395 else: 396 unstacked.columns = new_columns 397 398 return unstacked 399 400 401def unstack(obj, level, fill_value=None): 402 403 if isinstance(level, (tuple, list)): 404 if len(level) != 1: 405 # _unstack_multiple only handles MultiIndexes, 406 # and isn't needed for a single level 407 return _unstack_multiple(obj, level, fill_value=fill_value) 408 else: 409 level = level[0] 410 411 # Prioritize integer interpretation (GH #21677): 412 if not is_integer(level) and not level == "__placeholder__": 413 level = obj.index._get_level_number(level) 414 415 if isinstance(obj, DataFrame): 416 if isinstance(obj.index, MultiIndex): 417 return _unstack_frame(obj, level, fill_value=fill_value) 418 else: 419 return obj.T.stack(dropna=False) 420 elif not isinstance(obj.index, MultiIndex): 421 # GH 36113 422 # Give nicer error messages when unstack a Series whose 423 # Index is not a MultiIndex. 424 raise ValueError( 425 f"index must be a MultiIndex to unstack, {type(obj.index)} was passed" 426 ) 427 else: 428 if is_extension_array_dtype(obj.dtype): 429 return _unstack_extension_series(obj, level, fill_value) 430 unstacker = _Unstacker( 431 obj.index, level=level, constructor=obj._constructor_expanddim 432 ) 433 return unstacker.get_result( 434 obj.values, value_columns=None, fill_value=fill_value 435 ) 436 437 438def _unstack_frame(obj, level, fill_value=None): 439 if not obj._can_fast_transpose: 440 unstacker = _Unstacker(obj.index, level=level) 441 mgr = obj._mgr.unstack(unstacker, fill_value=fill_value) 442 return obj._constructor(mgr) 443 else: 444 return _Unstacker( 445 obj.index, level=level, constructor=obj._constructor 446 ).get_result(obj._values, value_columns=obj.columns, fill_value=fill_value) 447 448 449def _unstack_extension_series(series, level, fill_value): 450 """ 451 Unstack an ExtensionArray-backed Series. 452 453 The ExtensionDtype is preserved. 454 455 Parameters 456 ---------- 457 series : Series 458 A Series with an ExtensionArray for values 459 level : Any 460 The level name or number. 461 fill_value : Any 462 The user-level (not physical storage) fill value to use for 463 missing values introduced by the reshape. Passed to 464 ``series.values.take``. 465 466 Returns 467 ------- 468 DataFrame 469 Each column of the DataFrame will have the same dtype as 470 the input Series. 471 """ 472 # Defer to the logic in ExtensionBlock._unstack 473 df = series.to_frame() 474 result = df.unstack(level=level, fill_value=fill_value) 475 return result.droplevel(level=0, axis=1) 476 477 478def stack(frame, level=-1, dropna=True): 479 """ 480 Convert DataFrame to Series with multi-level Index. Columns become the 481 second level of the resulting hierarchical index 482 483 Returns 484 ------- 485 stacked : Series 486 """ 487 488 def factorize(index): 489 if index.is_unique: 490 return index, np.arange(len(index)) 491 codes, categories = factorize_from_iterable(index) 492 return categories, codes 493 494 N, K = frame.shape 495 496 # Will also convert negative level numbers and check if out of bounds. 497 level_num = frame.columns._get_level_number(level) 498 499 if isinstance(frame.columns, MultiIndex): 500 return _stack_multi_columns(frame, level_num=level_num, dropna=dropna) 501 elif isinstance(frame.index, MultiIndex): 502 new_levels = list(frame.index.levels) 503 new_codes = [lab.repeat(K) for lab in frame.index.codes] 504 505 clev, clab = factorize(frame.columns) 506 new_levels.append(clev) 507 new_codes.append(np.tile(clab, N).ravel()) 508 509 new_names = list(frame.index.names) 510 new_names.append(frame.columns.name) 511 new_index = MultiIndex( 512 levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False 513 ) 514 else: 515 levels, (ilab, clab) = zip(*map(factorize, (frame.index, frame.columns))) 516 codes = ilab.repeat(K), np.tile(clab, N).ravel() 517 new_index = MultiIndex( 518 levels=levels, 519 codes=codes, 520 names=[frame.index.name, frame.columns.name], 521 verify_integrity=False, 522 ) 523 524 if not frame.empty and frame._is_homogeneous_type: 525 # For homogeneous EAs, frame._values will coerce to object. So 526 # we concatenate instead. 527 dtypes = list(frame.dtypes._values) 528 dtype = dtypes[0] 529 530 if is_extension_array_dtype(dtype): 531 arr = dtype.construct_array_type() 532 new_values = arr._concat_same_type( 533 [col._values for _, col in frame.items()] 534 ) 535 new_values = _reorder_for_extension_array_stack(new_values, N, K) 536 else: 537 # homogeneous, non-EA 538 new_values = frame._values.ravel() 539 540 else: 541 # non-homogeneous 542 new_values = frame._values.ravel() 543 544 if dropna: 545 mask = notna(new_values) 546 new_values = new_values[mask] 547 new_index = new_index[mask] 548 549 return frame._constructor_sliced(new_values, index=new_index) 550 551 552def stack_multiple(frame, level, dropna=True): 553 # If all passed levels match up to column names, no 554 # ambiguity about what to do 555 if all(lev in frame.columns.names for lev in level): 556 result = frame 557 for lev in level: 558 result = stack(result, lev, dropna=dropna) 559 560 # Otherwise, level numbers may change as each successive level is stacked 561 elif all(isinstance(lev, int) for lev in level): 562 # As each stack is done, the level numbers decrease, so we need 563 # to account for that when level is a sequence of ints 564 result = frame 565 # _get_level_number() checks level numbers are in range and converts 566 # negative numbers to positive 567 level = [frame.columns._get_level_number(lev) for lev in level] 568 569 # Can't iterate directly through level as we might need to change 570 # values as we go 571 for index in range(len(level)): 572 lev = level[index] 573 result = stack(result, lev, dropna=dropna) 574 # Decrement all level numbers greater than current, as these 575 # have now shifted down by one 576 updated_level = [] 577 for other in level: 578 if other > lev: 579 updated_level.append(other - 1) 580 else: 581 updated_level.append(other) 582 level = updated_level 583 584 else: 585 raise ValueError( 586 "level should contain all level names or all level " 587 "numbers, not a mixture of the two." 588 ) 589 590 return result 591 592 593def _stack_multi_columns(frame, level_num=-1, dropna=True): 594 def _convert_level_number(level_num, columns): 595 """ 596 Logic for converting the level number to something we can safely pass 597 to swaplevel. 598 599 If `level_num` matches a column name return the name from 600 position `level_num`, otherwise return `level_num`. 601 """ 602 if level_num in columns.names: 603 return columns.names[level_num] 604 605 return level_num 606 607 this = frame.copy() 608 609 # this makes life much simpler 610 if level_num != frame.columns.nlevels - 1: 611 # roll levels to put selected level at end 612 roll_columns = this.columns 613 for i in range(level_num, frame.columns.nlevels - 1): 614 # Need to check if the ints conflict with level names 615 lev1 = _convert_level_number(i, roll_columns) 616 lev2 = _convert_level_number(i + 1, roll_columns) 617 roll_columns = roll_columns.swaplevel(lev1, lev2) 618 this.columns = roll_columns 619 620 if not this.columns.is_lexsorted(): 621 # Workaround the edge case where 0 is one of the column names, 622 # which interferes with trying to sort based on the first 623 # level 624 level_to_sort = _convert_level_number(0, this.columns) 625 this = this.sort_index(level=level_to_sort, axis=1) 626 627 # tuple list excluding level for grouping columns 628 if len(frame.columns.levels) > 2: 629 tuples = list( 630 zip( 631 *[ 632 lev.take(level_codes) 633 for lev, level_codes in zip( 634 this.columns.levels[:-1], this.columns.codes[:-1] 635 ) 636 ] 637 ) 638 ) 639 unique_groups = [key for key, _ in itertools.groupby(tuples)] 640 new_names = this.columns.names[:-1] 641 new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) 642 else: 643 new_columns = this.columns.levels[0]._shallow_copy(name=this.columns.names[0]) 644 unique_groups = new_columns 645 646 # time to ravel the values 647 new_data = {} 648 level_vals = this.columns.levels[-1] 649 level_codes = sorted(set(this.columns.codes[-1])) 650 level_vals_used = level_vals[level_codes] 651 levsize = len(level_codes) 652 drop_cols = [] 653 for key in unique_groups: 654 try: 655 loc = this.columns.get_loc(key) 656 except KeyError: 657 drop_cols.append(key) 658 continue 659 660 # can make more efficient? 661 # we almost always return a slice 662 # but if unsorted can get a boolean 663 # indexer 664 if not isinstance(loc, slice): 665 slice_len = len(loc) 666 else: 667 slice_len = loc.stop - loc.start 668 669 if slice_len != levsize: 670 chunk = this.loc[:, this.columns[loc]] 671 chunk.columns = level_vals.take(chunk.columns.codes[-1]) 672 value_slice = chunk.reindex(columns=level_vals_used).values 673 else: 674 if frame._is_homogeneous_type and is_extension_array_dtype( 675 frame.dtypes.iloc[0] 676 ): 677 dtype = this[this.columns[loc]].dtypes.iloc[0] 678 subset = this[this.columns[loc]] 679 680 value_slice = dtype.construct_array_type()._concat_same_type( 681 [x._values for _, x in subset.items()] 682 ) 683 N, K = this.shape 684 idx = np.arange(N * K).reshape(K, N).T.ravel() 685 value_slice = value_slice.take(idx) 686 687 elif frame._is_mixed_type: 688 value_slice = this[this.columns[loc]].values 689 else: 690 value_slice = this.values[:, loc] 691 692 if value_slice.ndim > 1: 693 # i.e. not extension 694 value_slice = value_slice.ravel() 695 696 new_data[key] = value_slice 697 698 if len(drop_cols) > 0: 699 new_columns = new_columns.difference(drop_cols) 700 701 N = len(this) 702 703 if isinstance(this.index, MultiIndex): 704 new_levels = list(this.index.levels) 705 new_names = list(this.index.names) 706 new_codes = [lab.repeat(levsize) for lab in this.index.codes] 707 else: 708 old_codes, old_levels = factorize_from_iterable(this.index) 709 new_levels = [old_levels] 710 new_codes = [old_codes.repeat(levsize)] 711 new_names = [this.index.name] # something better? 712 713 new_levels.append(level_vals) 714 new_codes.append(np.tile(level_codes, N)) 715 new_names.append(frame.columns.names[level_num]) 716 717 new_index = MultiIndex( 718 levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False 719 ) 720 721 result = frame._constructor(new_data, index=new_index, columns=new_columns) 722 723 # more efficient way to go about this? can do the whole masking biz but 724 # will only save a small amount of time... 725 if dropna: 726 result = result.dropna(axis=0, how="all") 727 728 return result 729 730 731def get_dummies( 732 data, 733 prefix=None, 734 prefix_sep="_", 735 dummy_na=False, 736 columns=None, 737 sparse=False, 738 drop_first=False, 739 dtype=None, 740) -> "DataFrame": 741 """ 742 Convert categorical variable into dummy/indicator variables. 743 744 Parameters 745 ---------- 746 data : array-like, Series, or DataFrame 747 Data of which to get dummy indicators. 748 prefix : str, list of str, or dict of str, default None 749 String to append DataFrame column names. 750 Pass a list with length equal to the number of columns 751 when calling get_dummies on a DataFrame. Alternatively, `prefix` 752 can be a dictionary mapping column names to prefixes. 753 prefix_sep : str, default '_' 754 If appending prefix, separator/delimiter to use. Or pass a 755 list or dictionary as with `prefix`. 756 dummy_na : bool, default False 757 Add a column to indicate NaNs, if False NaNs are ignored. 758 columns : list-like, default None 759 Column names in the DataFrame to be encoded. 760 If `columns` is None then all the columns with 761 `object` or `category` dtype will be converted. 762 sparse : bool, default False 763 Whether the dummy-encoded columns should be backed by 764 a :class:`SparseArray` (True) or a regular NumPy array (False). 765 drop_first : bool, default False 766 Whether to get k-1 dummies out of k categorical levels by removing the 767 first level. 768 dtype : dtype, default np.uint8 769 Data type for new columns. Only a single dtype is allowed. 770 771 Returns 772 ------- 773 DataFrame 774 Dummy-coded data. 775 776 See Also 777 -------- 778 Series.str.get_dummies : Convert Series to dummy codes. 779 780 Examples 781 -------- 782 >>> s = pd.Series(list('abca')) 783 784 >>> pd.get_dummies(s) 785 a b c 786 0 1 0 0 787 1 0 1 0 788 2 0 0 1 789 3 1 0 0 790 791 >>> s1 = ['a', 'b', np.nan] 792 793 >>> pd.get_dummies(s1) 794 a b 795 0 1 0 796 1 0 1 797 2 0 0 798 799 >>> pd.get_dummies(s1, dummy_na=True) 800 a b NaN 801 0 1 0 0 802 1 0 1 0 803 2 0 0 1 804 805 >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], 806 ... 'C': [1, 2, 3]}) 807 808 >>> pd.get_dummies(df, prefix=['col1', 'col2']) 809 C col1_a col1_b col2_a col2_b col2_c 810 0 1 1 0 0 1 0 811 1 2 0 1 1 0 0 812 2 3 1 0 0 0 1 813 814 >>> pd.get_dummies(pd.Series(list('abcaa'))) 815 a b c 816 0 1 0 0 817 1 0 1 0 818 2 0 0 1 819 3 1 0 0 820 4 1 0 0 821 822 >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True) 823 b c 824 0 0 0 825 1 1 0 826 2 0 1 827 3 0 0 828 4 0 0 829 830 >>> pd.get_dummies(pd.Series(list('abc')), dtype=float) 831 a b c 832 0 1.0 0.0 0.0 833 1 0.0 1.0 0.0 834 2 0.0 0.0 1.0 835 """ 836 from pandas.core.reshape.concat import concat 837 838 dtypes_to_encode = ["object", "category"] 839 840 if isinstance(data, DataFrame): 841 # determine columns being encoded 842 if columns is None: 843 data_to_encode = data.select_dtypes(include=dtypes_to_encode) 844 elif not is_list_like(columns): 845 raise TypeError("Input must be a list-like for parameter `columns`") 846 else: 847 data_to_encode = data[columns] 848 849 # validate prefixes and separator to avoid silently dropping cols 850 def check_len(item, name): 851 852 if is_list_like(item): 853 if not len(item) == data_to_encode.shape[1]: 854 len_msg = ( 855 f"Length of '{name}' ({len(item)}) did not match the " 856 "length of the columns being encoded " 857 f"({data_to_encode.shape[1]})." 858 ) 859 raise ValueError(len_msg) 860 861 check_len(prefix, "prefix") 862 check_len(prefix_sep, "prefix_sep") 863 864 if isinstance(prefix, str): 865 prefix = itertools.cycle([prefix]) 866 if isinstance(prefix, dict): 867 prefix = [prefix[col] for col in data_to_encode.columns] 868 869 if prefix is None: 870 prefix = data_to_encode.columns 871 872 # validate separators 873 if isinstance(prefix_sep, str): 874 prefix_sep = itertools.cycle([prefix_sep]) 875 elif isinstance(prefix_sep, dict): 876 prefix_sep = [prefix_sep[col] for col in data_to_encode.columns] 877 878 with_dummies: List[DataFrame] 879 if data_to_encode.shape == data.shape: 880 # Encoding the entire df, do not prepend any dropped columns 881 with_dummies = [] 882 elif columns is not None: 883 # Encoding only cols specified in columns. Get all cols not in 884 # columns to prepend to result. 885 with_dummies = [data.drop(columns, axis=1)] 886 else: 887 # Encoding only object and category dtype columns. Get remaining 888 # columns to prepend to result. 889 with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)] 890 891 for (col, pre, sep) in zip(data_to_encode.items(), prefix, prefix_sep): 892 # col is (column_name, column), use just column data here 893 dummy = _get_dummies_1d( 894 col[1], 895 prefix=pre, 896 prefix_sep=sep, 897 dummy_na=dummy_na, 898 sparse=sparse, 899 drop_first=drop_first, 900 dtype=dtype, 901 ) 902 with_dummies.append(dummy) 903 result = concat(with_dummies, axis=1) 904 else: 905 result = _get_dummies_1d( 906 data, 907 prefix, 908 prefix_sep, 909 dummy_na, 910 sparse=sparse, 911 drop_first=drop_first, 912 dtype=dtype, 913 ) 914 return result 915 916 917def _get_dummies_1d( 918 data, 919 prefix, 920 prefix_sep="_", 921 dummy_na=False, 922 sparse=False, 923 drop_first=False, 924 dtype=None, 925): 926 from pandas.core.reshape.concat import concat 927 928 # Series avoids inconsistent NaN handling 929 codes, levels = factorize_from_iterable(Series(data)) 930 931 if dtype is None: 932 dtype = np.uint8 933 dtype = np.dtype(dtype) 934 935 if is_object_dtype(dtype): 936 raise ValueError("dtype=object is not a valid dtype for get_dummies") 937 938 def get_empty_frame(data) -> DataFrame: 939 if isinstance(data, Series): 940 index = data.index 941 else: 942 index = np.arange(len(data)) 943 return DataFrame(index=index) 944 945 # if all NaN 946 if not dummy_na and len(levels) == 0: 947 return get_empty_frame(data) 948 949 codes = codes.copy() 950 if dummy_na: 951 codes[codes == -1] = len(levels) 952 levels = np.append(levels, np.nan) 953 954 # if dummy_na, we just fake a nan level. drop_first will drop it again 955 if drop_first and len(levels) == 1: 956 return get_empty_frame(data) 957 958 number_of_cols = len(levels) 959 960 if prefix is None: 961 dummy_cols = levels 962 else: 963 dummy_cols = [f"{prefix}{prefix_sep}{level}" for level in levels] 964 965 index: Optional[Index] 966 if isinstance(data, Series): 967 index = data.index 968 else: 969 index = None 970 971 if sparse: 972 973 fill_value: Union[bool, float, int] 974 if is_integer_dtype(dtype): 975 fill_value = 0 976 elif dtype == bool: 977 fill_value = False 978 else: 979 fill_value = 0.0 980 981 sparse_series = [] 982 N = len(data) 983 sp_indices: List[List] = [[] for _ in range(len(dummy_cols))] 984 mask = codes != -1 985 codes = codes[mask] 986 n_idx = np.arange(N)[mask] 987 988 for ndx, code in zip(n_idx, codes): 989 sp_indices[code].append(ndx) 990 991 if drop_first: 992 # remove first categorical level to avoid perfect collinearity 993 # GH12042 994 sp_indices = sp_indices[1:] 995 dummy_cols = dummy_cols[1:] 996 for col, ixs in zip(dummy_cols, sp_indices): 997 sarr = SparseArray( 998 np.ones(len(ixs), dtype=dtype), 999 sparse_index=IntIndex(N, ixs), 1000 fill_value=fill_value, 1001 dtype=dtype, 1002 ) 1003 sparse_series.append(Series(data=sarr, index=index, name=col)) 1004 1005 out = concat(sparse_series, axis=1, copy=False) 1006 return out 1007 1008 else: 1009 dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0) 1010 1011 if not dummy_na: 1012 # reset NaN GH4446 1013 dummy_mat[codes == -1] = 0 1014 1015 if drop_first: 1016 # remove first GH12042 1017 dummy_mat = dummy_mat[:, 1:] 1018 dummy_cols = dummy_cols[1:] 1019 return DataFrame(dummy_mat, index=index, columns=dummy_cols) 1020 1021 1022def _reorder_for_extension_array_stack(arr, n_rows: int, n_columns: int): 1023 """ 1024 Re-orders the values when stacking multiple extension-arrays. 1025 1026 The indirect stacking method used for EAs requires a followup 1027 take to get the order correct. 1028 1029 Parameters 1030 ---------- 1031 arr : ExtensionArray 1032 n_rows, n_columns : int 1033 The number of rows and columns in the original DataFrame. 1034 1035 Returns 1036 ------- 1037 taken : ExtensionArray 1038 The original `arr` with elements re-ordered appropriately 1039 1040 Examples 1041 -------- 1042 >>> arr = np.array(['a', 'b', 'c', 'd', 'e', 'f']) 1043 >>> _reorder_for_extension_array_stack(arr, 2, 3) 1044 array(['a', 'c', 'e', 'b', 'd', 'f'], dtype='<U1') 1045 1046 >>> _reorder_for_extension_array_stack(arr, 3, 2) 1047 array(['a', 'd', 'b', 'e', 'c', 'f'], dtype='<U1') 1048 """ 1049 # final take to get the order correct. 1050 # idx is an indexer like 1051 # [c0r0, c1r0, c2r0, ..., 1052 # c0r1, c1r1, c2r1, ...] 1053 idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel() 1054 return arr.take(idx) 1055