import itertools
from typing import List, Optional, Union

import numpy as np

import pandas._libs.algos as libalgos
import pandas._libs.reshape as libreshape
from pandas._libs.sparse import IntIndex
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.cast import maybe_promote
from pandas.core.dtypes.common import (
    ensure_platform_int,
    is_bool_dtype,
    is_extension_array_dtype,
    is_integer,
    is_integer_dtype,
    is_list_like,
    is_object_dtype,
    needs_i8_conversion,
)
from pandas.core.dtypes.missing import notna

import pandas.core.algorithms as algos
from pandas.core.arrays import SparseArray
from pandas.core.arrays.categorical import factorize_from_iterable
from pandas.core.frame import DataFrame
from pandas.core.indexes.api import Index, MultiIndex
from pandas.core.series import Series
from pandas.core.sorting import (
    compress_group_index,
    decons_obs_group_ids,
    get_compressed_ids,
    get_group_index,
)


class _Unstacker:
    """
    Helper class to unstack data / pivot with multi-level index

    Parameters
    ----------
    index : MultiIndex
    level : int or str, default last level
        Level to "unstack". Accepts a name for the level.
    fill_value : scalar, optional
        Default value to fill in missing values if subgroups do not have the
        same set of labels. By default, missing values will be replaced with
        the default fill value for that data type, NaN for float, NaT for
        datetimelike, etc. For integer types, by default data will be
        converted to float and missing values will be set to NaN.
    constructor : object
        Pandas ``DataFrame`` or subclass used to create unstacked
        response.  If None, DataFrame will be used.

    Returns
    -------
    unstacked : DataFrame

    Examples
    --------
    >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
    ...                                    ('two', 'a'), ('two', 'b')])
    >>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index)
    >>> s
    one  a    1
         b    2
    two  a    3
         b    4
    dtype: int64

    >>> s.unstack(level=-1)
         a  b
    one  1  2
    two  3  4

    >>> s.unstack(level=0)
       one  two
    a    1    3
    b    2    4
    """

    def __init__(self, index: MultiIndex, level=-1, constructor=None):

        if constructor is None:
            constructor = DataFrame
        self.constructor = constructor

        self.index = index.remove_unused_levels()

        self.level = self.index._get_level_number(level)

        # when index includes `nan`, need to lift levels/strides by 1
        self.lift = 1 if -1 in self.index.codes[self.level] else 0

        # Note: the "pop" below alters these in-place.
        self.new_index_levels = list(self.index.levels)
        self.new_index_names = list(self.index.names)

        self.removed_name = self.new_index_names.pop(self.level)
        self.removed_level = self.new_index_levels.pop(self.level)
        self.removed_level_full = index.levels[self.level]
        # Bug fix GH 20601
        # If the DataFrame is too big, the number of unique index combinations
        # will cause an int32 overflow on Windows environments.
        # We want to check and raise an error before this happens.
        num_rows = np.max([index_level.size for index_level in self.new_index_levels])
        num_columns = self.removed_level.size

        # GH20601: This forces an overflow if the number of cells is too high.
        num_cells = np.multiply(num_rows, num_columns, dtype=np.int32)
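        # e.g. num_rows = num_columns = 2**16: the int32 product wraps around
        # to 0, tripping the check below even though the true cell count
        # is 2**32.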

        if num_rows > 0 and num_columns > 0 and num_cells <= 0:
            raise ValueError("Unstacked DataFrame is too big, causing int32 overflow")

        self._make_selectors()

    @cache_readonly
    def _indexer_and_to_sort(self):
        v = self.level

        codes = list(self.index.codes)
        levs = list(self.index.levels)
        to_sort = codes[:v] + codes[v + 1 :] + [codes[v]]
        sizes = [len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]]
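        # The level being unstacked goes last so that, after the group sort
        # below, rows sharing the remaining levels are contiguous and ordered
        # by the unstacked level's codes.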

        comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
        ngroups = len(obs_ids)

        indexer = libalgos.groupsort_indexer(comp_index, ngroups)[0]
        indexer = ensure_platform_int(indexer)

        return indexer, to_sort

    @cache_readonly
    def sorted_labels(self):
        indexer, to_sort = self._indexer_and_to_sort
        return [line.take(indexer) for line in to_sort]

    def _make_sorted_values(self, values: np.ndarray) -> np.ndarray:
        indexer, _ = self._indexer_and_to_sort

        sorted_values = algos.take_nd(values, indexer, axis=0)
        return sorted_values

    def _make_selectors(self):
        new_levels = self.new_index_levels

        # make the mask
        remaining_labels = self.sorted_labels[:-1]
        level_sizes = [len(x) for x in new_levels]

        comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
        ngroups = len(obs_ids)

        comp_index = ensure_platform_int(comp_index)
        stride = self.index.levshape[self.level] + self.lift
        self.full_shape = ngroups, stride

        selector = self.sorted_labels[-1] + stride * comp_index + self.lift
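        # Each observation lands in flat cell (group row * stride + level
        # code); `lift` shifts everything right by one when a NaN column is
        # prepended.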
        mask = np.zeros(np.prod(self.full_shape), dtype=bool)
        mask.put(selector, True)

        if mask.sum() < len(self.index):
            raise ValueError("Index contains duplicate entries, cannot reshape")

        self.group_index = comp_index
        self.mask = mask
        self.unique_groups = obs_ids
        self.compressor = comp_index.searchsorted(np.arange(ngroups))

    def get_result(self, values, value_columns, fill_value):

        if values.ndim == 1:
            values = values[:, np.newaxis]

        if value_columns is None and values.shape[1] != 1:  # pragma: no cover
            raise ValueError("must pass column labels for multi-column data")

        values, _ = self.get_new_values(values, fill_value)
        columns = self.get_new_columns(value_columns)
        index = self.new_index

        return self.constructor(values, index=index, columns=columns)

    def get_new_values(self, values, fill_value=None):

        if values.ndim == 1:
            values = values[:, np.newaxis]

        sorted_values = self._make_sorted_values(values)

        # place the values
        length, width = self.full_shape
        stride = values.shape[1]
        result_width = width * stride
        result_shape = (length, result_width)
        mask = self.mask
        mask_all = mask.all()

        # we can simply reshape if we don't have a mask
        if mask_all and len(values):
            # TODO: Under what circumstances can we rely on sorted_values
            #  matching values?  When that holds, we can slice instead
            #  of take (in particular for EAs)
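            # (length * width, stride) -> (length, width, stride), swap the
            # last two axes, then flatten back to (length, stride * width):
            # each input column becomes a contiguous block of output columns.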
            new_values = (
                sorted_values.reshape(length, width, stride)
                .swapaxes(1, 2)
                .reshape(result_shape)
            )
            new_mask = np.ones(result_shape, dtype=bool)
            return new_values, new_mask

        # if our mask is all True, then we can use our existing dtype
        if mask_all:
            dtype = values.dtype
            new_values = np.empty(result_shape, dtype=dtype)
        else:
            dtype, fill_value = maybe_promote(values.dtype, fill_value)
            new_values = np.empty(result_shape, dtype=dtype)
            new_values.fill(fill_value)

        new_mask = np.zeros(result_shape, dtype=bool)

        name = np.dtype(dtype).name

        # we need to convert to a basic dtype
        # and possibly coerce an input to our output dtype
        # e.g. ints -> floats
        if needs_i8_conversion(values.dtype):
            sorted_values = sorted_values.view("i8")
            new_values = new_values.view("i8")
        elif is_bool_dtype(values.dtype):
            sorted_values = sorted_values.astype("object")
            new_values = new_values.astype("object")
        else:
            sorted_values = sorted_values.astype(name, copy=False)

        # fill in our values & mask
        libreshape.unstack(
            sorted_values,
            mask.view("u1"),
            stride,
            length,
            width,
            new_values,
            new_mask.view("u1"),
        )

        # reconstruct dtype if needed
        if needs_i8_conversion(values.dtype):
            new_values = new_values.view(values.dtype)

        return new_values, new_mask

    def get_new_columns(self, value_columns):
        if value_columns is None:
            if self.lift == 0:
                return self.removed_level._shallow_copy(name=self.removed_name)

            lev = self.removed_level.insert(0, item=self.removed_level._na_value)
            return lev.rename(self.removed_name)

        stride = len(self.removed_level) + self.lift
        width = len(value_columns)
        propagator = np.repeat(np.arange(width), stride)
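        # `propagator` repeats each value-column position once per unstacked
        # label, e.g. width=2, stride=3 -> [0, 0, 0, 1, 1, 1].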
        if isinstance(value_columns, MultiIndex):
            new_levels = value_columns.levels + (self.removed_level_full,)
            new_names = value_columns.names + (self.removed_name,)

            new_codes = [lab.take(propagator) for lab in value_columns.codes]
        else:
            new_levels = [value_columns, self.removed_level_full]
            new_names = [value_columns.name, self.removed_name]
            new_codes = [propagator]

        # The two indices differ only if the unstacked level had unused items:
        if len(self.removed_level_full) != len(self.removed_level):
            # In this case, we remap the new codes to the original level:
            repeater = self.removed_level_full.get_indexer(self.removed_level)
            if self.lift:
                repeater = np.insert(repeater, 0, -1)
        else:
            # Otherwise, we just use each level item exactly once:
            repeater = np.arange(stride) - self.lift

        # The entire level is then just a repetition of the single chunk:
        new_codes.append(np.tile(repeater, width))
        return MultiIndex(
            levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
        )

    @cache_readonly
    def new_index(self):
        # Does not depend on values or value_columns
        result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]]

        # construct the new index
        if len(self.new_index_levels) == 1:
            level, level_codes = self.new_index_levels[0], result_codes[0]
            if (level_codes == -1).any():
                level = level.insert(len(level), level._na_value)
            return level.take(level_codes).rename(self.new_index_names[0])

        return MultiIndex(
            levels=self.new_index_levels,
            codes=result_codes,
            names=self.new_index_names,
            verify_integrity=False,
        )

def _unstack_multiple(data, clocs, fill_value=None):
    if len(clocs) == 0:
        return data

    # NOTE: This doesn't deal with hierarchical columns yet

    index = data.index

    # GH 19966: make sure that if the MultiIndex's name is a tuple, it is
    # recognised as a whole rather than as a list of level names
    if clocs in index.names:
        clocs = [clocs]
    clocs = [index._get_level_number(i) for i in clocs]

    rlocs = [i for i in range(index.nlevels) if i not in clocs]

    clevels = [index.levels[i] for i in clocs]
    ccodes = [index.codes[i] for i in clocs]
    cnames = [index.names[i] for i in clocs]
    rlevels = [index.levels[i] for i in rlocs]
    rcodes = [index.codes[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

    shape = [len(x) for x in clevels]
    group_index = get_group_index(ccodes, shape, sort=False, xnull=False)

    comp_ids, obs_ids = compress_group_index(group_index, sort=False)
    recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False)
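    # recons_codes maps each observed (compressed) group id back to per-level
    # codes; it is used below to rebuild the unstacked column levels.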

    if not rlocs:
        # Everything is in clocs, so the dummy df has a regular index
        dummy_index = Index(obs_ids, name="__placeholder__")
    else:
        dummy_index = MultiIndex(
            levels=rlevels + [obs_ids],
            codes=rcodes + [comp_ids],
            names=rnames + ["__placeholder__"],
            verify_integrity=False,
        )

    if isinstance(data, Series):
        dummy = data.copy()
        dummy.index = dummy_index

        unstacked = dummy.unstack("__placeholder__", fill_value=fill_value)
        new_levels = clevels
        new_names = cnames
        new_codes = recons_codes
    else:
        if isinstance(data.columns, MultiIndex):
            result = data
            for i in range(len(clocs)):
                val = clocs[i]
                result = result.unstack(val, fill_value=fill_value)
                clocs = [v if v < val else v - 1 for v in clocs]

            return result

        dummy = data.copy()
        dummy.index = dummy_index

        unstacked = dummy.unstack("__placeholder__", fill_value=fill_value)
        if isinstance(unstacked, Series):
            unstcols = unstacked.index
        else:
            unstcols = unstacked.columns
        assert isinstance(unstcols, MultiIndex)  # for mypy
        new_levels = [unstcols.levels[0]] + clevels
        new_names = [data.columns.name] + cnames

        new_codes = [unstcols.codes[0]]
        for rec in recons_codes:
            new_codes.append(rec.take(unstcols.codes[-1]))

    new_columns = MultiIndex(
        levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
    )

    if isinstance(unstacked, Series):
        unstacked.index = new_columns
    else:
        unstacked.columns = new_columns

    return unstacked


def unstack(obj, level, fill_value=None):

    if isinstance(level, (tuple, list)):
        if len(level) != 1:
            # _unstack_multiple only handles MultiIndexes,
            # and isn't needed for a single level
            return _unstack_multiple(obj, level, fill_value=fill_value)
        else:
            level = level[0]

    # Prioritize integer interpretation (GH #21677):
    if not is_integer(level) and not level == "__placeholder__":
        level = obj.index._get_level_number(level)

    if isinstance(obj, DataFrame):
        if isinstance(obj.index, MultiIndex):
            return _unstack_frame(obj, level, fill_value=fill_value)
        else:
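            # With a flat index there is nothing to unstack; transposing and
            # stacking the columns yields the equivalent reshape.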
            return obj.T.stack(dropna=False)
    elif not isinstance(obj.index, MultiIndex):
        # GH 36113
        # Give a nicer error message when unstacking a Series whose
        # Index is not a MultiIndex.
        raise ValueError(
            f"index must be a MultiIndex to unstack, {type(obj.index)} was passed"
        )
    else:
        if is_extension_array_dtype(obj.dtype):
            return _unstack_extension_series(obj, level, fill_value)
        unstacker = _Unstacker(
            obj.index, level=level, constructor=obj._constructor_expanddim
        )
        return unstacker.get_result(
            obj.values, value_columns=None, fill_value=fill_value
        )


def _unstack_frame(obj, level, fill_value=None):
    if not obj._can_fast_transpose:
        unstacker = _Unstacker(obj.index, level=level)
        mgr = obj._mgr.unstack(unstacker, fill_value=fill_value)
        return obj._constructor(mgr)
    else:
        return _Unstacker(
            obj.index, level=level, constructor=obj._constructor
        ).get_result(obj._values, value_columns=obj.columns, fill_value=fill_value)


def _unstack_extension_series(series, level, fill_value):
    """
    Unstack an ExtensionArray-backed Series.

    The ExtensionDtype is preserved.

    Parameters
    ----------
    series : Series
        A Series with an ExtensionArray for values
    level : Any
        The level name or number.
    fill_value : Any
        The user-level (not physical storage) fill value to use for
        missing values introduced by the reshape. Passed to
        ``series.values.take``.

    Returns
    -------
    DataFrame
        Each column of the DataFrame will have the same dtype as
        the input Series.
    """
    # Defer to the logic in ExtensionBlock._unstack
    df = series.to_frame()
    result = df.unstack(level=level, fill_value=fill_value)
    return result.droplevel(level=0, axis=1)


def stack(frame, level=-1, dropna=True):
    """
    Convert DataFrame to Series with multi-level Index. Columns become the
    second level of the resulting hierarchical index.

    Returns
    -------
    stacked : Series
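
    Examples
    --------
    >>> df = pd.DataFrame([[1, 2], [3, 4]],
    ...                   index=["one", "two"], columns=["a", "b"])
    >>> stack(df)
    one  a    1
         b    2
    two  a    3
         b    4
    dtype: int64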
    """

    def factorize(index):
        if index.is_unique:
            return index, np.arange(len(index))
        codes, categories = factorize_from_iterable(index)
        return categories, codes

    N, K = frame.shape

    # Will also convert negative level numbers and check if out of bounds.
    level_num = frame.columns._get_level_number(level)

    if isinstance(frame.columns, MultiIndex):
        return _stack_multi_columns(frame, level_num=level_num, dropna=dropna)
    elif isinstance(frame.index, MultiIndex):
        new_levels = list(frame.index.levels)
        new_codes = [lab.repeat(K) for lab in frame.index.codes]

        clev, clab = factorize(frame.columns)
        new_levels.append(clev)
        new_codes.append(np.tile(clab, N).ravel())

        new_names = list(frame.index.names)
        new_names.append(frame.columns.name)
        new_index = MultiIndex(
            levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
        )
    else:
        levels, (ilab, clab) = zip(*map(factorize, (frame.index, frame.columns)))
        codes = ilab.repeat(K), np.tile(clab, N).ravel()
        new_index = MultiIndex(
            levels=levels,
            codes=codes,
            names=[frame.index.name, frame.columns.name],
            verify_integrity=False,
        )

    if not frame.empty and frame._is_homogeneous_type:
        # For homogeneous EAs, frame._values will coerce to object. So
        # we concatenate instead.
        dtypes = list(frame.dtypes._values)
        dtype = dtypes[0]

        if is_extension_array_dtype(dtype):
            arr = dtype.construct_array_type()
            new_values = arr._concat_same_type(
                [col._values for _, col in frame.items()]
            )
            new_values = _reorder_for_extension_array_stack(new_values, N, K)
        else:
            # homogeneous, non-EA
            new_values = frame._values.ravel()

    else:
        # non-homogeneous
        new_values = frame._values.ravel()

    if dropna:
        mask = notna(new_values)
        new_values = new_values[mask]
        new_index = new_index[mask]

    return frame._constructor_sliced(new_values, index=new_index)


def stack_multiple(frame, level, dropna=True):
    # If all passed levels match column names, there is no
    # ambiguity about what to do
    if all(lev in frame.columns.names for lev in level):
        result = frame
        for lev in level:
            result = stack(result, lev, dropna=dropna)

    # Otherwise, level numbers may change as each successive level is stacked
    elif all(isinstance(lev, int) for lev in level):
        # As each stack is done, the level numbers decrease, so we need
        # to account for that when level is a sequence of ints
        result = frame
        # _get_level_number() checks level numbers are in range and converts
        # negative numbers to positive
        level = [frame.columns._get_level_number(lev) for lev in level]

        # Can't iterate directly through level as we might need to change
        # values as we go
        for index in range(len(level)):
            lev = level[index]
            result = stack(result, lev, dropna=dropna)
            # Decrement all level numbers greater than current, as these
            # have now shifted down by one
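            # e.g. level=[0, 1]: once level 0 is stacked, the original
            # level 1 sits at position 0, so the list becomes [0, 0].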
            updated_level = []
            for other in level:
                if other > lev:
                    updated_level.append(other - 1)
                else:
                    updated_level.append(other)
            level = updated_level

    else:
        raise ValueError(
            "level should contain all level names or all level "
            "numbers, not a mixture of the two."
        )

    return result


def _stack_multi_columns(frame, level_num=-1, dropna=True):
    def _convert_level_number(level_num, columns):
        """
        Logic for converting the level number to something we can safely pass
        to swaplevel.

        If `level_num` matches a column name, return the name from
        position `level_num`; otherwise return `level_num`.
        """
        if level_num in columns.names:
            return columns.names[level_num]

        return level_num

    this = frame.copy()

    # this makes life much simpler
    if level_num != frame.columns.nlevels - 1:
        # roll levels to put selected level at end
        roll_columns = this.columns
        for i in range(level_num, frame.columns.nlevels - 1):
            # Need to check if the ints conflict with level names
            lev1 = _convert_level_number(i, roll_columns)
            lev2 = _convert_level_number(i + 1, roll_columns)
            roll_columns = roll_columns.swaplevel(lev1, lev2)
        this.columns = roll_columns

    if not this.columns.is_lexsorted():
        # Workaround the edge case where 0 is one of the column names,
        # which interferes with trying to sort based on the first
        # level
        level_to_sort = _convert_level_number(0, this.columns)
        this = this.sort_index(level=level_to_sort, axis=1)

    # tuple list excluding level for grouping columns
    if len(frame.columns.levels) > 2:
        tuples = list(
            zip(
                *[
                    lev.take(level_codes)
                    for lev, level_codes in zip(
                        this.columns.levels[:-1], this.columns.codes[:-1]
                    )
                ]
            )
        )
        unique_groups = [key for key, _ in itertools.groupby(tuples)]
        new_names = this.columns.names[:-1]
        new_columns = MultiIndex.from_tuples(unique_groups, names=new_names)
    else:
        new_columns = this.columns.levels[0]._shallow_copy(name=this.columns.names[0])
        unique_groups = new_columns

    # time to ravel the values
    new_data = {}
    level_vals = this.columns.levels[-1]
    level_codes = sorted(set(this.columns.codes[-1]))
    level_vals_used = level_vals[level_codes]
    levsize = len(level_codes)
    drop_cols = []
    for key in unique_groups:
        try:
            loc = this.columns.get_loc(key)
        except KeyError:
            drop_cols.append(key)
            continue

        # Can this be made more efficient?
        # get_loc almost always returns a slice, but for unsorted
        # columns it can return a boolean indexer.
        if not isinstance(loc, slice):
            slice_len = len(loc)
        else:
            slice_len = loc.stop - loc.start

        if slice_len != levsize:
            chunk = this.loc[:, this.columns[loc]]
            chunk.columns = level_vals.take(chunk.columns.codes[-1])
            value_slice = chunk.reindex(columns=level_vals_used).values
        else:
            if frame._is_homogeneous_type and is_extension_array_dtype(
                frame.dtypes.iloc[0]
            ):
                dtype = this[this.columns[loc]].dtypes.iloc[0]
                subset = this[this.columns[loc]]

                value_slice = dtype.construct_array_type()._concat_same_type(
                    [x._values for _, x in subset.items()]
                )
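                # The per-column concatenation above is column-major; restore
                # row-major order (the same fix-up as
                # _reorder_for_extension_array_stack).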
                N, K = this.shape
                idx = np.arange(N * K).reshape(K, N).T.ravel()
                value_slice = value_slice.take(idx)

            elif frame._is_mixed_type:
                value_slice = this[this.columns[loc]].values
            else:
                value_slice = this.values[:, loc]

        if value_slice.ndim > 1:
            # i.e. not extension
            value_slice = value_slice.ravel()

        new_data[key] = value_slice

    if len(drop_cols) > 0:
        new_columns = new_columns.difference(drop_cols)

    N = len(this)

    if isinstance(this.index, MultiIndex):
        new_levels = list(this.index.levels)
        new_names = list(this.index.names)
        new_codes = [lab.repeat(levsize) for lab in this.index.codes]
    else:
        old_codes, old_levels = factorize_from_iterable(this.index)
        new_levels = [old_levels]
        new_codes = [old_codes.repeat(levsize)]
        new_names = [this.index.name]  # something better?

    new_levels.append(level_vals)
    new_codes.append(np.tile(level_codes, N))
    new_names.append(frame.columns.names[level_num])

    new_index = MultiIndex(
        levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
    )

    result = frame._constructor(new_data, index=new_index, columns=new_columns)

    # Is there a more efficient way to do this? We could do the whole
    # masking business, but it would only save a small amount of time...
    if dropna:
        result = result.dropna(axis=0, how="all")

    return result


def get_dummies(
    data,
    prefix=None,
    prefix_sep="_",
    dummy_na=False,
    columns=None,
    sparse=False,
    drop_first=False,
    dtype=None,
) -> "DataFrame":
    """
    Convert categorical variable into dummy/indicator variables.

    Parameters
    ----------
    data : array-like, Series, or DataFrame
        Data of which to get dummy indicators.
    prefix : str, list of str, or dict of str, default None
        String to use as a prefix for the new DataFrame column names.
        Pass a list with length equal to the number of columns
        when calling get_dummies on a DataFrame. Alternatively, `prefix`
        can be a dictionary mapping column names to prefixes.
    prefix_sep : str, default '_'
        If appending prefix, separator/delimiter to use. Or pass a
        list or dictionary as with `prefix`.
    dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.
    columns : list-like, default None
        Column names in the DataFrame to be encoded.
        If `columns` is None then all the columns with
        `object` or `category` dtype will be converted.
    sparse : bool, default False
        Whether the dummy-encoded columns should be backed by
        a :class:`SparseArray` (True) or a regular NumPy array (False).
    drop_first : bool, default False
        Whether to get k-1 dummies out of k categorical levels by removing the
        first level.
    dtype : dtype, default np.uint8
        Data type for new columns. Only a single dtype is allowed.

    Returns
    -------
    DataFrame
        Dummy-coded data.

    See Also
    --------
    Series.str.get_dummies : Convert Series to dummy codes.

    Examples
    --------
    >>> s = pd.Series(list('abca'))

    >>> pd.get_dummies(s)
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> s1 = ['a', 'b', np.nan]

    >>> pd.get_dummies(s1)
       a  b
    0  1  0
    1  0  1
    2  0  0

    >>> pd.get_dummies(s1, dummy_na=True)
       a  b  NaN
    0  1  0    0
    1  0  1    0
    2  0  0    1

    >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
    ...                    'C': [1, 2, 3]})

    >>> pd.get_dummies(df, prefix=['col1', 'col2'])
       C  col1_a  col1_b  col2_a  col2_b  col2_c
    0  1       1       0       0       1       0
    1  2       0       1       1       0       0
    2  3       1       0       0       0       1

    >>> pd.get_dummies(pd.Series(list('abcaa')))
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0
    4  1  0  0

    >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
       b  c
    0  0  0
    1  1  0
    2  0  1
    3  0  0
    4  0  0

    >>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
         a    b    c
    0  1.0  0.0  0.0
    1  0.0  1.0  0.0
    2  0.0  0.0  1.0
    """
    from pandas.core.reshape.concat import concat

    dtypes_to_encode = ["object", "category"]

    if isinstance(data, DataFrame):
        # determine columns being encoded
        if columns is None:
            data_to_encode = data.select_dtypes(include=dtypes_to_encode)
        elif not is_list_like(columns):
            raise TypeError("Input must be a list-like for parameter `columns`")
        else:
            data_to_encode = data[columns]

        # validate prefixes and separator to avoid silently dropping cols
        def check_len(item, name):

            if is_list_like(item):
                if not len(item) == data_to_encode.shape[1]:
                    len_msg = (
                        f"Length of '{name}' ({len(item)}) did not match the "
                        "length of the columns being encoded "
                        f"({data_to_encode.shape[1]})."
                    )
                    raise ValueError(len_msg)

        check_len(prefix, "prefix")
        check_len(prefix_sep, "prefix_sep")

        if isinstance(prefix, str):
            prefix = itertools.cycle([prefix])
        if isinstance(prefix, dict):
            prefix = [prefix[col] for col in data_to_encode.columns]

        if prefix is None:
            prefix = data_to_encode.columns

        # validate separators
        if isinstance(prefix_sep, str):
            prefix_sep = itertools.cycle([prefix_sep])
        elif isinstance(prefix_sep, dict):
            prefix_sep = [prefix_sep[col] for col in data_to_encode.columns]

        with_dummies: List[DataFrame]
        if data_to_encode.shape == data.shape:
            # Encoding the entire df, do not prepend any dropped columns
            with_dummies = []
        elif columns is not None:
            # Encoding only cols specified in columns. Get all cols not in
            # columns to prepend to result.
            with_dummies = [data.drop(columns, axis=1)]
        else:
            # Encoding only object and category dtype columns. Get remaining
            # columns to prepend to result.
            with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)]

        for (col, pre, sep) in zip(data_to_encode.items(), prefix, prefix_sep):
            # col is (column_name, column), use just column data here
            dummy = _get_dummies_1d(
                col[1],
                prefix=pre,
                prefix_sep=sep,
                dummy_na=dummy_na,
                sparse=sparse,
                drop_first=drop_first,
                dtype=dtype,
            )
            with_dummies.append(dummy)
        result = concat(with_dummies, axis=1)
    else:
        result = _get_dummies_1d(
            data,
            prefix,
            prefix_sep,
            dummy_na,
            sparse=sparse,
            drop_first=drop_first,
            dtype=dtype,
        )
    return result


def _get_dummies_1d(
    data,
    prefix,
    prefix_sep="_",
    dummy_na=False,
    sparse=False,
    drop_first=False,
    dtype=None,
):
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.uint8
    dtype = np.dtype(dtype)

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data) -> DataFrame:
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = [f"{prefix}{prefix_sep}{level}" for level in levels]

    index: Optional[Index]
    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:

        fill_value: Union[bool, float, int]
        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == bool:
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        sp_indices: List[List] = [[] for _ in range(len(dummy_cols))]
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)
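        # sp_indices[j] now holds the row positions where level j occurs,
        # i.e. exactly the nonzero entries of dummy column j.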

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col))

        out = concat(sparse_series, axis=1, copy=False)
        return out

    else:
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0)
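        # e.g. two levels and codes = [0, 1, 0]: np.eye(2).take(codes, axis=0)
        # yields rows [[1, 0], [0, 1], [1, 0]]. A missing value (code -1)
        # wraps around to the last row and is zeroed out just below.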

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)


def _reorder_for_extension_array_stack(arr, n_rows: int, n_columns: int):
    """
    Re-orders the values when stacking multiple extension-arrays.

    The indirect stacking method used for EAs requires a followup
    take to get the order correct.

    Parameters
    ----------
    arr : ExtensionArray
    n_rows, n_columns : int
        The number of rows and columns in the original DataFrame.

    Returns
    -------
    taken : ExtensionArray
        The original `arr` with elements re-ordered appropriately.

    Examples
    --------
    >>> arr = np.array(['a', 'b', 'c', 'd', 'e', 'f'])
    >>> _reorder_for_extension_array_stack(arr, 2, 3)
    array(['a', 'c', 'e', 'b', 'd', 'f'], dtype='<U1')

    >>> _reorder_for_extension_array_stack(arr, 3, 2)
    array(['a', 'd', 'b', 'e', 'c', 'f'], dtype='<U1')
    """
    # final take to get the order correct.
    # idx is an indexer like
    # [c0r0, c1r0, c2r0, ...,
    #  c0r1, c1r1, c2r1, ...]
    idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel()
    return arr.take(idx)