Pandas - 语法基础 Pt.4

这篇随笔主要介绍 Pandas 的两种数据类型:Series 和 DataFrame 相关的内容
Pt.1 部分主要介绍对 Pandas 两种数据类型的基本操作,包括创建、索引和修改
Pt.2 部分详细介绍了 Pandas 的索引操作
Pt.3 部分主要介绍了 Pandas 的计算和函数
Pt.4 部分详细介绍了 Pandas 的统计计算相关的函数

1
2
3
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

统计计算

1
2
3
4
frame = pd.DataFrame([[1, np.nan], [2, 4],
[np.nan, np.nan], [3, 5]],
index=['a', 'b', 'c', 'd'],
columns=['one', 'two'])

obj.sum( axis, skipna, level ) : 求和

1
frame
onetwo
a1.0NaN
b2.04.0
cNaNNaN
d3.05.0
1
frame.sum()
1
2
3
one    6.0
two 9.0
dtype: float64
1
frame.sum(axis=1)
1
2
3
4
5
a    1.0
b 6.0
c 0.0
d 8.0
dtype: float64
1
frame.sum(axis=1, skipna=False)
1
2
3
4
5
a    NaN
b 6.0
c NaN
d 8.0
dtype: float64

obj.mean( axis, skipna, level ) : 求平均数

1
frame.mean()
1
2
3
one    2.0
two 4.5
dtype: float64

obj.median( axis, skipna, level ) : 求中位数

1
frame.median()
1
2
3
one    2.0
two 4.5
dtype: float64

obj.mad( axis, skipna, level ) : 求绝对离差

1
frame.mad()
1
2
3
one    0.666667
two 0.500000
dtype: float64

obj.var( axis, skipna, level ) : 求方差

1
frame.var()
1
2
3
one    1.0
two 0.5
dtype: float64

obj.std( axis, skipna, level ) : 求标准差

1
frame.std()
1
2
3
one    1.000000
two 0.707107
dtype: float64

obj.skew( axis, skipna, level ) : 求偏度

1
frame.skew()
1
2
3
one    0.0
two NaN
dtype: float64

obj.kurt( axis, skipna, level ) : 求峰度

1
pd.DataFrame([1,2,3,4,3,2,1,0]).kurt()
1
2
0   -0.7
dtype: float64

obj.quantile( q, axis ) : 求分位数(二分位、四分位、四分之三分位 ... )

1
frame, frame.quantile(0.5), frame.mean()
1
2
3
4
5
6
7
8
9
10
11
(   one  two
a 1.0 NaN
b 2.0 4.0
c NaN NaN
d 3.0 5.0,
one 2.0
two 4.5
Name: 0.5, dtype: float64,
one 2.0
two 4.5
dtype: float64)
1
frame.quantile(0.75) - frame.quantile(0.25) ### 离差IQR
1
2
3
one    1.0
two 0.5
dtype: float64

obj.cumsum( axis, skipna, level ), obj.cumprod( axis, skipna, level ) : 累加, 累乘

1
frame
onetwo
a1.0NaN
b2.04.0
cNaNNaN
d3.05.0
1
frame.cumsum()
onetwo
a1.0NaN
b3.04.0
cNaNNaN
d6.09.0
1
frame.cumprod()
onetwo
a1.0NaN
b2.04.0
cNaNNaN
d6.020.0

obj.cummin( axis, skipna, level ), obj.cummax( axis, skipna, level ) : 累计最小值和累计最大值

1
frame.cummax()
onetwo
a1.0NaN
b2.04.0
cNaNNaN
d3.05.0

obj.idmax( axis, skipna, level ), obj.idmin( axis, skipna, level ) : 返回最大值或最小值的索引

1
frame
onetwo
a1.0NaN
b2.04.0
cNaNNaN
d3.05.0
1
frame.idxmax()
1
2
3
one    d
two d
dtype: object
1
frame.idxmax(axis=1)
1
2
3
4
5
a    one
b two
c NaN
d two
dtype: object

obj.describe( ) : 返回多个汇总统计

1
frame
onetwo
a1.0NaN
b2.04.0
cNaNNaN
d3.05.0
1
frame.describe()
onetwo
count3.02.000000
mean2.04.500000
std1.00.707107
min1.04.000000
25%1.54.250000
50%2.04.500000
75%2.54.750000
max3.05.000000

对于非数值类型

1
2
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj.head(), obj.describe()
1
2
3
4
5
6
7
8
9
10
11
(0    a
1 a
2 b
3 c
4 a
dtype: object,
count 16
unique 3
top a
freq 8
dtype: object)

series1.cov( series2 ), frame.cov( ) : 计算协方差

1
2
3
4
5
# 数据准备
price = pd.read_pickle('pydata-book-2nd-edition/examples/yahoo_price.pkl') #股票价格
volume = pd.read_pickle('pydata-book-2nd-edition/examples/yahoo_volume.pkl') #股票成交量
returns = price.pct_change() #计算变化率 : (后一个值 - 前一个值) / 前一个值
returns
AAPLGOOGIBMMSFT
Date
2010-01-04NaNNaNNaNNaN
2010-01-050.001729-0.004404-0.0120800.000323
2010-01-06-0.015906-0.025209-0.006496-0.006137
2010-01-07-0.001849-0.023280-0.003462-0.010400
2010-01-080.0066480.0133310.0100350.006897
...............
2016-10-17-0.0006800.0018370.002072-0.003483
2016-10-18-0.0006810.019616-0.0261680.007690
2016-10-19-0.0029790.0078460.003583-0.002255
2016-10-20-0.000512-0.0056520.001719-0.004867
2016-10-21-0.0039300.003011-0.0124740.042096

1714 rows × 4 columns

1
returns['MSFT'].cov(returns['IBM'])

8.870655479703546e-05

1
returns.cov()
AAPLGOOGIBMMSFT
AAPL0.0002770.0001070.0000780.000095
GOOG0.0001070.0002510.0000780.000108
IBM0.0000780.0000780.0001460.000089
MSFT0.0000950.0001080.0000890.000215

series1.corr( series2 ), frame.corr( ) : 计算相关系数

1
returns['MSFT'].corr(returns['IBM'])
1
0.4997636114415114
1
returns.corr()
AAPLGOOGIBMMSFT
AAPL1.0000000.4079190.3868170.389695
GOOG0.4079191.0000000.4050990.465919
IBM0.3868170.4050991.0000000.499764
MSFT0.3896950.4659190.4997641.000000

obj1.corrwith( obj2, axis ) : 计算 obj1 与 obj2 对应行或列的相关系数

1
returns.corrwith(returns.IBM)
1
2
3
4
5
AAPL    0.386817
GOOG 0.405099
IBM 1.000000
MSFT 0.499764
dtype: float64
1
returns.corrwith(volume) # 一一对应
1
2
3
4
5
AAPL   -0.075565
GOOG -0.007067
IBM -0.204849
MSFT -0.092950
dtype: float64
1
returns.T.corrwith(volume.T, axis=1)
1
2
3
4
5
AAPL   -0.075565
GOOG -0.007067
IBM -0.204849
MSFT -0.092950
dtype: float64

series.unique( series2 ) : 得到 series 唯一值数组

1
2
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

obj.isin( list ) : 判断 obj 的元素是否在 list 中

1
2
3
4
5
frame = pd.DataFrame([[1, np.nan], [2, 4],
[np.nan, np.nan], [3, 5]],
index=['a', 'b', 'c', 'd'],
columns=['one', 'two'])
frame, frame.isin([1, 2])
1
2
3
4
5
6
7
8
9
10
(   one  two
a 1.0 NaN
b 2.0 4.0
c NaN NaN
d 3.0 5.0,
one two
a True False
b True False
c False False
d False False)

obj.value_counts( normalize, sort ) : 计算 obj 中各值出现的频率

1
2
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
obj.value_counts()
1
2
3
4
5
c    3
a 3
b 2
d 1
dtype: int64
1
2
3
4
5
frame = pd.DataFrame([[1, 3, 5], [2, 4, 6],
[1, 3, 6], [2, 4, 6]],
index=['a', 'b', 'c', 'd'],
columns=['one', 'two', 'thr'])
frame, frame.value_counts()
1
2
3
4
5
6
7
8
9
10
(   one  two  thr
a 1 3 5
b 2 4 6
c 1 3 6
d 2 4 6,
one two thr
2 4 6 2
1 3 5 1
6 1
dtype: int64)