文章目录

import numpy as np
import pandas as pd

1 pandas 数值计算和统计基础

1.1 常用数学,统计方法

1.1.1 axis,skipna

df = pd.DataFrame({'key1':[4,5,3,np.nan,2],
                 'key2':[1,2,np.nan,4,5],
                 'key3':[1,2,3,'j','k']},
                 index = ['a','b','c','d','e'])
df
# 计算一列平均值
df.mean(axis=0)
key1    3.5
key2    3.0
dtype: float64
# 计算行平均值
df.mean(axis=1)
a    2.5
b    3.5
c    3.0
d    4.0
e    3.5
dtype: float64
df.mean(skipna=False)# skipna参数:是否忽略NaN,默认True,如False,有NaN的列统计结果仍为NaN
key1   NaN
key2   NaN
dtype: float64

1.1.2 数学计算方法,适用于Series和DataFrame

df = pd.DataFrame({'key1':np.arange(10),
                  'key2':np.random.rand(10)*10})
df
df.count()# 统计非Na值的数量
key1    10
key2    10
dtype: int64
df.min() # 求最小值
key1    0.000000
key2    0.234182
dtype: float64
df.max()# 求最大值
key1    9.000000
key2    9.813665
dtype: float64
df.sum()
key1    45.00000
key2    43.25315
dtype: float64
df.mean()
key1    4.500000
key2    4.325315
dtype: float64
df.median()
key1    4.500000
key2    3.743325
dtype: float64
# std()
# skew()
# kurt()
# mode()

1.1.3 数学计算方法

df['key1_s'] = df['key1'].cumsum()# 累加
df['key2_s'] = df['key2'].cumsum()
df
df['key1_p'] = df['key1'].cumprod()# 累乘
df['key2_p'] = df['key2'].cumprod()
df

1.1.4 唯一值,值计数,成员资格

s = pd.Series(list('asdvasdcfgg'))
s
0     a
1     s
2     d
3     v
4     a
5     s
6     d
7     c
8     f
9     g
10    g
dtype: object
s.unique()# 得到一个唯一值数组
array(['a', 's', 'd', 'v', 'c', 'f', 'g'], dtype=object)
s.value_counts(sort=False)# 得到一个新的Series,计算出不同值出现的频率
a    2
d    2
v    1
c    1
s    2
f    1
g    2
dtype: int64
s = pd.Series(np.arange(10,15))
df = pd.DataFrame({'key1':list('asdcbvasd'),
                  'key2':np.arange(4,13)})
df
s
0    10
1    11
2    12
3    13
4    14
dtype: int32
s.isin([5,14])
0    False
1    False
2    False
3    False
4     True
dtype: bool
df.isin(['a','bc','10',8])

2 文本数据(Pandas针对字符串配备的一套方法,使其易于对数组的每个元素进行操作)

2.1 通过str访问,且自动排除丢失/NA值

s = pd.Series(['A','b','C','bbhello','123',np.nan,'hj'])
df = pd.DataFrame({'key1':list('abcdef'),
                  'key2':['hee','fv','w','hija','123',np.nan]})
df
s
0          A
1          b
2          C
3    bbhello
4        123
5        NaN
6         hj
dtype: object
s.str.count("b")
0    0.0
1    1.0
2    0.0
3    2.0
4    0.0
5    NaN
6    0.0
dtype: float64
df["key2"].str.upper()
0     HEE
1      FV
2       W
3    HIJA
4     123
5     NaN
Name: key2, dtype: object
df.columns = df.columns.str.upper()
df

2.2 字符串常用方法1 - lower,upper,len,startswith,endswith

s = pd.Series(['A','b','bbhello','123',np.nan])
s
0          A
1          b
2    bbhello
3        123
4        NaN
dtype: object
s.str.lower()# 小写
0          a
1          b
2    bbhello
3        123
4        NaN
dtype: object
s.str.upper()# 大写
0          A
1          B
2    BBHELLO
3        123
4        NaN
dtype: object
s.str.len() # 长度
0    1.0
1    1.0
2    7.0
3    3.0
4    NaN
dtype: float64
s.str.startswith("b")# 判断起始是否为b
0    False
1     True
2     True
3    False
4      NaN
dtype: object
s.str.endswith("3")# 判断结束是否为3
0    False
1    False
2    False
3     True
4      NaN
dtype: object

2.3 字符串常用方法2 - strip

s = pd.Series([' jack', 'jill ', ' jesse ', 'frank'])
df = pd.DataFrame(np.random.randn(3, 2), columns=[' Column A ', ' Column B '],
                  index=range(3))
df
s
0       jack
1      jill
2     jesse
3      frank
dtype: object
s.str.strip()
0     jack
1     jill
2    jesse
3    frank
dtype: object
s.str.lstrip()
0      jack
1     jill
2    jesse
3     frank
dtype: object
s.str.strip()
0     jack
1     jill
2    jesse
3    frank
dtype: object
df.columns.str.strip()
df

2.4 字符串常用方法3 - replace

df = pd.DataFrame(np.random.randn(3, 2), columns=[' Column A  ', '  Column B  '],
                  index=range(3))
df
df.columns.str.replace(" ","-")

2.5 字符串常用方法4 - split,rsplit

s = pd.Series(['a,b,c','1,2,3',['a,,,c'],np.nan])
s
0      a,b,c
1      1,2,3
2    [a,,,c]
3        NaN
dtype: object
s.str.split(",")
0    [a, b, c]
1    [1, 2, 3]
2          NaN
3          NaN
dtype: object

2.6 字符串索引

s = pd.Series(['A','b','C','bbhello','123',np.nan,'hj'])
df = pd.DataFrame({'key1':list('abcdef'),
                  'key2':['hee','fv','w','hija','123',np.nan]})
df
s
0          A
1          b
2          C
3    bbhello
4        123
5        NaN
6         hj
dtype: object
s.str[0]
0      A
1      b
2      C
3      b
4      1
5    NaN
6      h
dtype: object
s.str[:2]
0      A
1      b
2      C
3     bb
4     12
5    NaN
6     hj
dtype: object
df["key2"].str[0]
0      h
1      f
2      w
3      h
4      1
5    NaN
Name: key2, dtype: object

3 合并 merge ,join(Pandas具有全功能的,高性能内存中连接操作,与SQL等关系数据库非常相似)

3.1 pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=True, suffixes=('_x', '_y'), copy=True, indicator=False)

df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
df3 = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                    'key2': ['K0', 'K1', 'K0', 'K1'],
                    'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3']})
df4 = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                    'key2': ['K0', 'K0', 'K0', 'K0'],
                    'C': ['C0', 'C1', 'C2', 'C3'],
                    'D': ['D0', 'D1', 'D2', 'D3']})
df1
df2
df3
df4
# left:第一个df
# right:第二个df
# on:参考键
# 参数how → 合并方式(inner:默认,取交集;outer:取并集,数据缺失范围NaN;left:按照df3为参考合并,数据缺失范围NaN;right:按照df4为参考合并,数据缺失范围NaN)
# 参数 left_on, right_on, left_index, right_index → 当键不为一个列时,可以单独设置左键与右键
# 参数 sort
pd.merge(df1,df2,on = "key")
pd.merge(df3, df4, on=['key1','key2'])
pd.merge(df3, df4,on=['key1','key2'], how = 'inner')
pd.merge(df3, df4, on=['key1','key2'], how = 'outer')
pd.merge(df3, df4, on=['key1','key2'], how = 'left')
pd.merge(df3, df4, on=['key1','key2'], how = 'right')
df1 = pd.DataFrame({'lkey':list('bbacaab'),
                   'data1':range(7)})
df2 = pd.DataFrame({'rkey':list('abd'),
                   'date2':range(3)})
df1
df2
pd.merge(df1, df2, left_on='lkey', right_on='rkey')
df1 = pd.DataFrame({'key':list('abcdfeg'),
                   'data1':range(7)})
df2 = pd.DataFrame({'date2':range(100,105)},
                  index = list('abcde'))
df1
df2
pd.merge(df1, df2, left_on='key', right_index=True)
df1 = pd.DataFrame({'key':list('bbacaab'),
                   'data1':[1,3,2,4,5,9,7]})
df2 = pd.DataFrame({'key':list('abd'),
                   'date2':[11,2,33]})
df1
df2
pd.merge(df1,df2, on = 'key', how = 'outer')
pd.merge(df1,df2, on = 'key', sort=True, how = 'outer')

3.2 df.join() → 直接通过索引连接

left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                    index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                      'D': ['D0', 'D2', 'D3']},
                     index=['K0', 'K2', 'K3'])
left
right
left.join(right)

4 连接与修补(连接 - 沿轴执行连接操作)

4.1 pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, copy=True)

s1 = pd.Series([1,2,3])
s2 = pd.Series([2,3,4])
s3 = pd.Series([1,2,3],index = ['a','c','h'])
s4 = pd.Series([2,3,4],index = ['b','e','d'])
pd.concat([s1,s2]) # 默认axis=0,行+行
0    1
1    2
2    3
0    2
1    3
2    4
dtype: int64
pd.concat([s3,s4]).sort_index()
a    1
b    2
c    2
d    4
e    3
h    3
dtype: int64
pd.concat([s3,s4], axis=1)
c:\users\caiwei\appdata\local\programs\python\python35\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.

To retain the current behavior and silence the warning, pass sort=False

  """Entry point for launching an IPython kernel.
s5 = pd.Series([1,2,3],index = ['a','b','c'])
s6 = pd.Series([2,3,4],index = ['b','c','d'])
pd.concat([s5,s6], axis= 1)
c:\users\caiwei\appdata\local\programs\python\python35\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.

To retain the current behavior and silence the warning, pass sort=False

  """Entry point for launching an IPython kernel.
pd.concat([s5,s6], axis= 1, join='inner')
pd.concat([s5,s6], axis= 1, join_axes=[['a','b','d']])# join_axes:指定联合的index
pd.concat([s5,s6], keys = ['one','two'])
one  a    1
     b    2
     c    3
two  b    2
     c    3
     d    4
dtype: int64
pd.concat([s5,s6], axis=1, keys = ['one','two'])
c:\users\caiwei\appdata\local\programs\python\python35\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.

To retain the current behavior and silence the warning, pass sort=False

  """Entry point for launching an IPython kernel.

4.2 修补 pd.combine_first()

df1 = pd.DataFrame([[np.nan, 3., 5.], [-4.6, np.nan, np.nan],[np.nan, 7., np.nan]])
df2 = pd.DataFrame([[-42.6, np.nan, -8.2], [-5., 1.6, 4]],index=[1, 2])
df1
df2
df1.combine_first(df2)

5 去重及替换

5.1 去重 .duplicated

s = pd.Series([1,1,1,1,2,2,2,3,4,5,5,5,5])
s.duplicated()
0     False
1      True
2      True
3      True
4     False
5      True
6      True
7     False
8     False
9     False
10     True
11     True
12     True
dtype: bool
s[s.duplicated()==False]# 通过布尔判断,得到不重复的值
0    1
4    2
7    3
8    4
9    5
dtype: int64
s.drop_duplicates()#inplace参数:是否替换原值,默认False
0    1
4    2
7    3
8    4
9    5
dtype: int64

5.2 替换 .replace

s = pd.Series(list('ascaazsd'))
s
0    a
1    s
2    c
3    a
4    a
5    z
6    s
7    d
dtype: object
s.replace(['a','s'] ,np.nan)
0    NaN
1    NaN
2      c
3    NaN
4    NaN
5      z
6    NaN
7      d
dtype: object
s.replace({'a':'hello world!','s':123})

6 数据分组(分组统计 - groupby功能)

① 根据某些条件将数据拆分成组

② 对每个组独立应用函数

③ 将结果合并到一个数据结构中

Dataframe在行(axis=0)或列(axis=1)上进行分组,将一个函数应用到各个分组并产生一个新值,然后函数执行结果被合并到最终的结果对象中。

df.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=False, **kwargs)

6.1 分组

df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
df
df.groupby('A').mean()
df.groupby(['A','B']).mean()

6.2 分组可迭代对象

df = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})
df
list(df.groupby('X'))
[('A',    X  Y
  0  A  1
  2  A  3), ('B',    X  Y
  1  B  4
  3  B  2)]
for n,g in df.groupby('X'):
    print(n)
    print(g)
A
   X  Y
0  A  1
2  A  3
B
   X  Y
1  B  4
3  B  2
df.groupby(['X']).get_group('A')
df.groupby(['X']).get_group('B')
grouped = df.groupby(['X'])
grouped.groups
{'A': Int64Index([0, 2], dtype='int64'),
 'B': Int64Index([1, 3], dtype='int64')}
grouped.groups['A']
Int64Index([0, 2], dtype='int64')
grouped.size()# .size():查看分组后的长度
X
A    2
B    2
dtype: int64
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
df
grouped = df.groupby(['A','B']).groups
grouped
{('bar', 'one'): Int64Index([1], dtype='int64'),
 ('bar', 'three'): Int64Index([3], dtype='int64'),
 ('bar', 'two'): Int64Index([5], dtype='int64'),
 ('foo', 'one'): Int64Index([0, 6], dtype='int64'),
 ('foo', 'three'): Int64Index([7], dtype='int64'),
 ('foo', 'two'): Int64Index([2, 4], dtype='int64')}
grouped[('foo', 'three')]
Int64Index([7], dtype='int64')

6.3 其他轴上分组

df = pd.DataFrame({'data1':np.random.rand(2),
                  'data2':np.random.rand(2),
                  'key1':['a','b'],
                  'key2':['one','two']})
df
for n,p in df.groupby(df.dtypes, axis=1):
    print(n)
    print(p)
float64
      data1     data2
0  0.811538  0.923935
1  0.301277  0.474025
object
  key1 key2
0    a  one
1    b  two

6.4 通过字典或者Series分组

df = pd.DataFrame(np.arange(16).reshape(4,4),
                  columns = ['a','b','c','d'])
df
mapping = {'a':'one','b':'one','c':'two','d':'two','e':'three'}
by_column = df.groupby(mapping, axis = 1)
by_column.sum()
s = pd.Series(mapping)
s
a      one
b      one
c      two
d      two
e    three
dtype: object
s.groupby(s).count()
one      2
three    1
two      2
dtype: int64

6.5 通过函数分组

df = pd.DataFrame(np.arange(16).reshape(4,4),
                  columns = ['a','b','c','d'],
                 index = ['abc','bcd','aa','b'])
df
df.groupby(len).sum()

6.6 分组计算函数方法

s = pd.Series([1, 2, 3, 10, 20, 30], index = [1, 2, 3, 1, 2, 3])
grouped = s.groupby(level=0)  # 唯一索引用.groupby(level=0),将同一个index的分为一组
print(grouped)
print(grouped.first(),'→ first:非NaN的第一个值\n')
print(grouped.last(),'→ last:非NaN的最后一个值\n')
print(grouped.sum(),'→ sum:非NaN的和\n')
print(grouped.mean(),'→ mean:非NaN的平均值\n')
print(grouped.median(),'→ median:非NaN的算术中位数\n')
print(grouped.count(),'→ count:非NaN的值\n')
print(grouped.min(),'→ min、max:非NaN的最小值、最大值\n')
print(grouped.std(),'→ std,var:非NaN的标准差和方差\n')
print(grouped.prod(),'→ prod:非NaN的积\n')
<pandas.core.groupby.groupby.SeriesGroupBy object at 0x00000260B42E26A0>
1    1
2    2
3    3
dtype: int64 → first:非NaN的第一个值

1    10
2    20
3    30
dtype: int64 → last:非NaN的最后一个值

1    11
2    22
3    33
dtype: int64 → sum:非NaN的和

1     5.5
2    11.0
3    16.5
dtype: float64 → mean:非NaN的平均值

1     5.5
2    11.0
3    16.5
dtype: float64 → median:非NaN的算术中位数

1    2
2    2
3    2
dtype: int64 → count:非NaN的值

1    1
2    2
3    3
dtype: int64 → min、max:非NaN的最小值、最大值

1     6.363961
2    12.727922
3    19.091883
dtype: float64 → std,var:非NaN的标准差和方差

1    10
2    40
3    90
dtype: int64 → prod:非NaN的积

6.7 多函数计算:agg()

df = pd.DataFrame({'a':[1,1,2,2],
                  'b':np.random.rand(4),
                  'c':np.random.rand(4),
                  'd':np.random.rand(4),})
df
df.groupby('a').agg(['mean',np.sum])
df.groupby('a')['b'].agg({'result1':np.mean,'result2':np.sum})
c:\users\caiwei\appdata\local\programs\python\python35\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
  """Entry point for launching an IPython kernel.

7 分组转换及一般性"拆分-应用-合并"

7.1 数据分组转换,transform

df = pd.DataFrame({'data1':np.random.rand(5),
                  'data2':np.random.rand(5),
                  'key1':list('aabba'),
                  'key2':['one','two','one','two','one']})
df
df.groupby('key2').transform(np.mean)

7.2 一般化Groupby方法:apply

df = pd.DataFrame({'data1':np.random.rand(5),
                  'data2':np.random.rand(5),
                  'key1':list('aabba'),
                  'key2':['one','two','one','two','one']})
df
df.groupby('key1').apply(lambda x: x.describe())

8 透视表及交叉表

8.1 透视表:pivot_table(pd.pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All'))

date = ['2017-5-1','2017-5-2','2017-5-3']*3
rng = pd.to_datetime(date)
df = pd.DataFrame({'date':rng,
                   'key':list('abcdabcda'),
                  'values':np.random.rand(9)*10})
df
# data:DataFrame对象
# values:要聚合的列或列的列表
# index:数据透视表的index,从原数据的列中筛选
# columns:数据透视表的columns,从原数据的列中筛选
# aggfunc:用于聚合的函数,默认为numpy.mean,支持numpy计算方法
pd.pivot_table(df, values = 'values', index = 'date', columns = 'key', aggfunc=np.sum)
pd.pivot_table(df, values = 'values', index = ['date','key'], aggfunc=len)

8.2 交叉表:crosstab,默认情况下,crosstab计算因子的频率表,比如用于str的数据透视分析(pd.crosstab(index, columns, values=None, rownames=None, colnames=None, aggfunc=None, margins=False, dropna=True, normalize=False))

df = pd.DataFrame({'A': [1, 2, 2, 2, 2],
                   'B': [3, 3, 4, 4, 4],
                   'C': [1, 1, np.nan, 1, 1]})
df
pd.crosstab(df['A'],df['B'])
# 如果crosstab只接收两个Series,它将提供一个频率表。
# 用A的唯一值,统计B唯一值的出现次数
pd.crosstab(df['A'],df['B'],normalize=True)# normalize:默认False,若为True则将计数转换为频率(占比)
10-07 17:39