Add Columns into a pandas DataFrame

Things on this page are fragmentary and immature notes/thoughts of the author. Please read with your own judgement!

Comment¶

When a Series is added as a new column into a DataFrame, values are added by matching index.

import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [5, 4, 3, 2, 1]})
df.head()

s = pd.Series([10, 20, 30, 40, 50], index=[4, 3, 2, 1, 0])
s

4    10
3    20
2    30
1    40
0    50
dtype: int64

df["z"] = s

df

s2 = pd.Series([10, 20, 30, 40, 50], index=[40, 30, 20, 10, 0])
s2

40    10
30    20
20    30
10    40
0     50
dtype: int64

df["s2"] = s2
df

df["s3"] = [10, 20, 30, 40, 50]
df

max(s, s2)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-17-fe58d3fe3214> in <module>()
----> 1 max(s, s2)

/opt/conda/lib/python3.6/site-packages/pandas/core/ops.py in wrapper(self, other, axis)
   1188 
   1189         elif isinstance(other, ABCSeries) and not self._indexed_same(other):
-> 1190             raise ValueError("Can only compare identically-labeled "
   1191                              "Series objects")
   1192 

ValueError: Can only compare identically-labeled Series objects

4    10
3    20
2    30
1    40
0    50
dtype: int64

s1 = pd.Series([1, 2, 3, 4, 5], index=[4, 3, 2, 1, 0])

s1

4    1
3    2
2    3
1    4
0    5
dtype: int64

max(s, s1)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-27-68c4d3885ee8> in <module>()
----> 1 max(s, s1)

/opt/conda/lib/python3.6/site-packages/pandas/core/generic.py in __nonzero__(self)
   1571         raise ValueError("The truth value of a {0} is ambiguous. "
   1572                          "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
-> 1573                          .format(self.__class__.__name__))
   1574 
   1575     __bool__ = __nonzero__

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

dir(s1)

['T',
 '_AXIS_ALIASES',
 '_AXIS_IALIASES',
 '_AXIS_LEN',
 '_AXIS_NAMES',
 '_AXIS_NUMBERS',
 '_AXIS_ORDERS',
 '_AXIS_REVERSED',
 '_AXIS_SLICEMAP',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_prepare__',
 '__array_priority__',
 '__array_wrap__',
 '__bool__',
 '__bytes__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__div__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__long__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdiv__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rfloordiv__',
 '__rmatmul__',
 '__rmod__',
 '__rmul__',
 '__ror__',
 '__round__',
 '__rpow__',
 '__rsub__',
 '__rtruediv__',
 '__rxor__',
 '__setattr__',
 '__setitem__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__truediv__',
 '__unicode__',
 '__weakref__',
 '__xor__',
 '_accessors',
 '_add_numeric_operations',
 '_add_series_only_operations',
 '_add_series_or_dataframe_operations',
 '_agg_by_level',
 '_agg_doc',
 '_aggregate',
 '_aggregate_multiple_funcs',
 '_align_frame',
 '_align_series',
 '_binop',
 '_box_item_values',
 '_builtin_table',
 '_can_hold_na',
 '_check_inplace_setting',
 '_check_is_chained_assignment_possible',
 '_check_label_or_level_ambiguity',
 '_check_percentile',
 '_check_setitem_copy',
 '_clear_item_cache',
 '_clip_with_one_bound',
 '_clip_with_scalar',
 '_consolidate',
 '_consolidate_inplace',
 '_construct_axes_dict',
 '_construct_axes_dict_for_slice',
 '_construct_axes_dict_from',
 '_construct_axes_from_arguments',
 '_constructor',
 '_constructor_expanddim',
 '_constructor_sliced',
 '_convert',
 '_create_indexer',
 '_cython_table',
 '_deprecations',
 '_dir_additions',
 '_dir_deletions',
 '_drop_axis',
 '_drop_labels_or_levels',
 '_expand_axes',
 '_find_valid_index',
 '_formatting_values',
 '_from_axes',
 '_get_axis',
 '_get_axis_name',
 '_get_axis_number',
 '_get_axis_resolvers',
 '_get_block_manager_axis',
 '_get_bool_data',
 '_get_cacher',
 '_get_index_resolvers',
 '_get_item_cache',
 '_get_label_or_level_values',
 '_get_numeric_data',
 '_get_value',
 '_get_values',
 '_get_values_tuple',
 '_get_with',
 '_gotitem',
 '_iget_item_cache',
 '_index',
 '_indexed_same',
 '_info_axis',
 '_info_axis_name',
 '_info_axis_number',
 '_init_dict',
 '_init_mgr',
 '_internal_names',
 '_internal_names_set',
 '_is_builtin_func',
 '_is_cached',
 '_is_copy',
 '_is_cython_func',
 '_is_datelike_mixed_type',
 '_is_label_or_level_reference',
 '_is_label_reference',
 '_is_level_reference',
 '_is_mixed_type',
 '_is_numeric_mixed_type',
 '_is_view',
 '_ix',
 '_ixs',
 '_map_values',
 '_maybe_cache_changed',
 '_maybe_update_cacher',
 '_metadata',
 '_ndarray_values',
 '_needs_reindex_multi',
 '_obj_with_exclusions',
 '_protect_consolidate',
 '_reduce',
 '_reindex_axes',
 '_reindex_axis',
 '_reindex_indexer',
 '_reindex_multi',
 '_reindex_with_indexers',
 '_repr_data_resource_',
 '_repr_latex_',
 '_reset_cache',
 '_reset_cacher',
 '_selected_obj',
 '_selection',
 '_selection_list',
 '_selection_name',
 '_set_as_cached',
 '_set_axis',
 '_set_axis_name',
 '_set_is_copy',
 '_set_item',
 '_set_labels',
 '_set_name',
 '_set_subtyp',
 '_set_value',
 '_set_values',
 '_set_with',
 '_set_with_engine',
 '_setup_axes',
 '_shallow_copy',
 '_slice',
 '_stat_axis',
 '_stat_axis_name',
 '_stat_axis_number',
 '_take',
 '_to_dict_of_blocks',
 '_try_aggregate_string_function',
 '_typ',
 '_unpickle_series_compat',
 '_update_inplace',
 '_validate_dtype',
 '_values',
 '_where',
 '_xs',
 'abs',
 'add',
 'add_prefix',
 'add_suffix',
 'agg',
 'aggregate',
 'align',
 'all',
 'any',
 'append',
 'apply',
 'argmax',
 'argmin',
 'argsort',
 'as_matrix',
 'asfreq',
 'asof',
 'astype',
 'at',
 'at_time',
 'autocorr',
 'axes',
 'base',
 'between',
 'between_time',
 'bfill',
 'bool',
 'clip',
 'clip_lower',
 'clip_upper',
 'combine',
 'combine_first',
 'compound',
 'compress',
 'copy',
 'corr',
 'count',
 'cov',
 'cummax',
 'cummin',
 'cumprod',
 'cumsum',
 'data',
 'describe',
 'diff',
 'div',
 'divide',
 'divmod',
 'dot',
 'drop',
 'drop_duplicates',
 'dropna',
 'dtype',
 'dtypes',
 'duplicated',
 'empty',
 'eq',
 'equals',
 'ewm',
 'expanding',
 'factorize',
 'ffill',
 'fillna',
 'filter',
 'first',
 'first_valid_index',
 'flags',
 'floordiv',
 'from_array',
 'ftype',
 'ftypes',
 'ge',
 'get',
 'get_dtype_counts',
 'get_ftype_counts',
 'get_values',
 'groupby',
 'gt',
 'hasnans',
 'head',
 'hist',
 'iat',
 'idxmax',
 'idxmin',
 'iloc',
 'imag',
 'index',
 'infer_objects',
 'interpolate',
 'is_monotonic',
 'is_monotonic_decreasing',
 'is_monotonic_increasing',
 'is_unique',
 'isin',
 'isna',
 'isnull',
 'item',
 'items',
 'itemsize',
 'iteritems',
 'ix',
 'keys',
 'kurt',
 'kurtosis',
 'last',
 'last_valid_index',
 'le',
 'loc',
 'lt',
 'mad',
 'map',
 'mask',
 'max',
 'mean',
 'median',
 'memory_usage',
 'min',
 'mod',
 'mode',
 'mul',
 'multiply',
 'name',
 'nbytes',
 'ndim',
 'ne',
 'nlargest',
 'nonzero',
 'notna',
 'notnull',
 'nsmallest',
 'nunique',
 'pct_change',
 'pipe',
 'plot',
 'pop',
 'pow',
 'prod',
 'product',
 'ptp',
 'put',
 'quantile',
 'radd',
 'rank',
 'ravel',
 'rdiv',
 'real',
 'reindex',
 'reindex_axis',
 'reindex_like',
 'rename',
 'rename_axis',
 'reorder_levels',
 'repeat',
 'replace',
 'resample',
 'reset_index',
 'rfloordiv',
 'rmod',
 'rmul',
 'rolling',
 'round',
 'rpow',
 'rsub',
 'rtruediv',
 'sample',
 'searchsorted',
 'select',
 'sem',
 'set_axis',
 'shape',
 'shift',
 'size',
 'skew',
 'slice_shift',
 'sort_index',
 'sort_values',
 'squeeze',
 'std',
 'strides',
 'sub',
 'subtract',
 'sum',
 'swapaxes',
 'swaplevel',
 'tail',
 'take',
 'to_clipboard',
 'to_csv',
 'to_dense',
 'to_dict',
 'to_excel',
 'to_frame',
 'to_hdf',
 'to_json',
 'to_latex',
 'to_msgpack',
 'to_period',
 'to_pickle',
 'to_sparse',
 'to_sql',
 'to_string',
 'to_timestamp',
 'to_xarray',
 'tolist',
 'transform',
 'transpose',
 'truediv',
 'truncate',
 'tshift',
 'tz_convert',
 'tz_localize',
 'unique',
 'unstack',
 'update',
 'value_counts',
 'values',
 'var',
 'view',
 'where',
 'xs']

4    10
3    20
2    30
1    40
0    50
dtype: int64

s.clip(15, 35)

4    15
3    20
2    30
1    35
0    35
dtype: int64

s.clip_lower(15)

4    15
3    20
2    30
1    40
0    50
dtype: int64

4    10
3    20
2    30
1    40
0    50
dtype: int64

s2

40    10
30    20
20    30
10    40
0     50
dtype: int64

s.combine(s1, min)

4    1
3    2
2    3
1    4
0    5
dtype: int64

s.combine(s2, min, np.inf)

0     50
1     40
2     30
3     20
4     10
10    40
20    30
30    20
40    10
dtype: int64

import numpy as np

np.inf

inf

4    10
3    20
2    30
1    40
0    50
dtype: int64

s2

40    10
30    20
20    30
10    40
0     50
dtype: int64

np.minimum(s, s1)

4    1
3    2
2    3
1    4
0    5
dtype: int64

np.clip(pd.Series([1, 2, 3]), pd.Series([3, 4, 5]), None)

0    3
1    4
2    5
dtype: int64

?s.clip

Signature: s.clip(lower=None, upper=None, axis=None, inplace=False, *args, **kwargs)
Docstring:
Trim values at input threshold(s).

Assigns values outside boundary to boundary values. Thresholds
can be singular values or array like, and in the latter case
the clipping is performed element-wise in the specified axis.

Parameters
----------
lower : float or array_like, default None
    Minimum threshold value. All values below this
    threshold will be set to it.
upper : float or array_like, default None
    Maximum threshold value. All values above this
    threshold will be set to it.
axis : int or string axis name, optional
    Align object with lower and upper along the given axis.
inplace : boolean, default False
    Whether to perform the operation in place on the data.

    .. versionadded:: 0.21.0
*args, **kwargs
    Additional keywords have no effect but might be accepted
    for compatibility with numpy.

See Also
--------
clip_lower : Clip values below specified threshold(s).
clip_upper : Clip values above specified threshold(s).

Returns
-------
Series or DataFrame
    Same type as calling object with the values outside the
    clip boundaries replaced

Examples
--------
>>> data = {'col_0': [9, -3, 0, -1, 5], 'col_1': [-2, -7, 6, 8, -5]}
>>> df = pd.DataFrame(data)
>>> df
   col_0  col_1
0      9     -2
1     -3     -7
2      0      6
3     -1      8
4      5     -5

Clips per column using lower and upper thresholds:

>>> df.clip(-4, 6)
   col_0  col_1
0      6     -2
1     -3     -4
2      0      6
3     -1      6
4      5     -4

Clips using specific lower and upper thresholds per column element:

>>> t = pd.Series([2, -4, -1, 6, 3])
>>> t
0    2
1   -4
2   -1
3    6
4    3
dtype: int64

>>> df.clip(t, t + 4, axis=0)
   col_0  col_1
0      6      2
1     -3     -4
2      0      3
3      6      8
4      5      3
File:      /opt/conda/lib/python3.6/site-packages/pandas/core/generic.py
Type:      method

s.clip_lower

4    10
3    20
2    30
1    40
0    50
dtype: int64

s1

4    1
3    2
2    3
1    4
0    5
dtype: int64