Pandas - 数据清洗 Pt.2

这篇随笔主要介绍如何利用 Pandas 对数据进行清洗
Pt.1 部分主要介绍利用 Pandas 处理缺失数据
Pt.2 部分详细介绍利用 Pandas 处理重复数据、替换数据和划分数据
Pt.3 部分主要介绍利用 Pandas 与 正则表达式的结合

1
2
import pandas as pd
import numpy as np

处理重复数据

obj.duplicated( columns, keep ), obj.drop_duplicates( columns, keep ) : 移除重复数据

1
2
3
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'], 
'k2': [1, 1, 2, 2, 3, 3, 3]})
data
k1k2
0one1
1two1
2one2
3two2
4one3
5two3
6two3
1
data.duplicated()
1
2
3
4
5
6
7
8
0    False
1 False
2 False
3 False
4 False
5 False
6 True
dtype: bool
1
data.drop_duplicates()
k1k2
0one1
1two1
2one2
3two2
4one3
5two3

columns : 指定部分列进行重复项判断/过滤

1
data.duplicated(['k1'])
1
2
3
4
5
6
7
8
0    False
1 False
2 True
3 True
4 True
5 True
6 True
dtype: bool
1
data.drop_duplicates(['k2'])
k1k2
0one1
2one2
4one3

keep : keep = 'first' / keep = 'last' , 保留项

1
data.duplicated(['k1'], keep='last')
1
2
3
4
5
6
7
8
0     True
1 True
2 True
3 True
4 False
5 True
6 False
dtype: bool
1
data.drop_duplicates(['k2'], keep='last')
k1k2
1two1
3two2
6two3

数据映射与替换

series.map( arg ) : 利用函数或字典进行数据映射

1
2
3
4
5
data = pd.DataFrame({'food': ['Bacon', 'pulled pork', 'bacon',
'Pastrami', 'corned beef', 'Bacon',
'pastrami', 'honey ham','nova lox'],
'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
foodounces
0Bacon4.0
1pulled pork3.0
2bacon12.0
3Pastrami6.0
4corned beef7.5
5Bacon8.0
6pastrami3.0
7honey ham5.0
8nova lox6.0

映射数据

1
2
3
4
5
6
7
meat_to_animal = {'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'}
meat_to_animal
1
2
3
4
5
6
{'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'}

series.map( dict )

1
2
data['animal1'] = data['food'].str.lower().map(meat_to_animal)
data
foodouncesanimal1
0Bacon4.0pig
1pulled pork3.0pig
2bacon12.0pig
3Pastrami6.0cow
4corned beef7.5cow
5Bacon8.0pig
6pastrami3.0cow
7honey ham5.0pig
8nova lox6.0salmon

series.map( func )

1
2
data['animal2'] = data['food'].map( lambda x: meat_to_animal[x.lower()] )
data
foodouncesanimal1animal2
0Bacon4.0pigpig
1pulled pork3.0pigpig
2bacon12.0pigpig
3Pastrami6.0cowcow
4corned beef7.5cowcow
5Bacon8.0pigpig
6pastrami3.0cowcow
7honey ham5.0pigpig
8nova lox6.0salmonsalmon

obj.replace( to_replace, value ) : 数据替换

1
2
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data.replace([-999, -1000], np.nan)
1
2
3
4
5
6
7
0    1.0
1 NaN
2 2.0
3 NaN
4 NaN
5 3.0
dtype: float64
1
data.replace([-999, -1000], [np.nan, 0])
1
2
3
4
5
6
7
0    1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
dtype: float64
1
data.replace({-999: np.nan, -1000: 0})
1
2
3
4
5
6
7
0    1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
dtype: float64

obj.rename( index, columns, inplace ) : 索引重命名

1
2
3
4
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
index=['Ohio', 'Colorado', 'New York'],
columns=['one', 'two', 'three', 'four'])
data
onetwothreefour
Ohio0123
Colorado4567
New York891011

series.map( func ) : map 方法

1
2
data.index = data.index.map( lambda x: x[:4].upper() )
data
onetwothreefour
OHIO0123
COLO4567
NEW891011

series.rename( index, columns ) : rename 方法

1
data.rename(index=str.title, columns=str.upper)
ONETWOTHREEFOUR
Ohio0123
Colo4567
New891011

rename 可以结合字典型对象实现对部分轴标签的更新

1
data.rename(index={'OHIO': 'INDIANA'}, columns={'three': 'peekaboo'})
onetwopeekaboofour
INDIANA0123
COLO4567
NEW891011

数据划分

pd.cut( x, bins, right, labels ) : 划分面元( binning )

x : The input array to be binned; must be 1-D
bins : 面元, 可以是确切的面元边界, 也可以是面元数量
right : True : ( , ] ; False : [ , )
labels : 设置面元的名称

bins = list : 直接指定面元

1
2
3
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins, right=False)
1
cats.categories # 展示了划分的面元
1
IntervalIndex([[18, 25), [25, 35), [35, 60), [60, 100)], dtype='interval[int64, left]')
1
cats.codes 
1
array([0, 0, 1, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
1
pd.value_counts(cats)
1
2
3
4
5
[18, 25)     4
[25, 35) 4
[35, 60) 3
[60, 100) 1
dtype: int64
1
2
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)
1
2
3
['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

bins = n : 根据样本的 最小值和最大值 计算等长的面元

1
2
data = np.random.randint(0, 11, (50))
pd.cut(data, 5)
1
2
3
[(-0.01, 2.0], (-0.01, 2.0], (-0.01, 2.0], (4.0, 6.0], (-0.01, 2.0], ..., (6.0, 8.0], (8.0, 10.0], (-0.01, 2.0], (4.0, 6.0], (-0.01, 2.0]]
Length: 50
Categories (5, interval[float64, right]): [(-0.01, 2.0] < (2.0, 4.0] < (4.0, 6.0] < (6.0, 8.0] < (8.0, 10.0]]

pd.qcut( x, q, labels ) : 根据分位数划分面元( quantile binning )

x : The input array to be binned; must be 1-D
q : Number of quantiles, 分位数
labels : 设置面元的名称

q = n : 根据样本的 分位数 对数据进行面元划分

1
2
3
data = np.random.randint(0, 101, (1000))
cats = pd.qcut(data, 4)
cats
1
2
3
[(50.5, 76.0], (26.0, 50.5], (-0.001, 26.0], (50.5, 76.0], (26.0, 50.5], ..., (-0.001, 26.0], (26.0, 50.5], (76.0, 100.0], (26.0, 50.5], (26.0, 50.5]]
Length: 1000
Categories (4, interval[float64, right]): [(-0.001, 26.0] < (26.0, 50.5] < (50.5, 76.0] < (76.0, 100.0]]
1
pd.value_counts(cats)
1
2
3
4
5
(50.5, 76.0]      260
(-0.001, 26.0] 255
(26.0, 50.5] 245
(76.0, 100.0] 240
dtype: int64

q = list : 传递自定义的分位数(0到1之间的数值, 包含端点)

1
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
1
2
3
[(50.5, 91.0], (11.0, 50.5], (11.0, 50.5], (50.5, 91.0], (11.0, 50.5], ..., (11.0, 50.5], (11.0, 50.5], (50.5, 91.0], (11.0, 50.5], (11.0, 50.5]]
Length: 1000
Categories (4, interval[float64, right]): [(-0.001, 11.0] < (11.0, 50.5] < (50.5, 91.0] < (91.0, 100.0]]