Pandas - 数据清洗 Pt.2
这篇随笔主要介绍如何利用 Pandas 对数据进行清洗
Pt.1 部分主要介绍利用 Pandas 处理缺失数据
Pt.2 部分详细介绍利用 Pandas 处理重复数据、替换数据和划分数据
Pt.3 部分主要介绍利用 Pandas 与 正则表达式的结合
1 | import pandas as pd |
处理重复数据
obj.duplicated( columns, keep ), obj.drop_duplicates( columns, keep ) : 移除重复数据
1 | data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'], |
k1 | k2 | |
---|---|---|
0 | one | 1 |
1 | two | 1 |
2 | one | 2 |
3 | two | 2 |
4 | one | 3 |
5 | two | 3 |
6 | two | 3 |
1 | data.duplicated() |
1 | 0 False |
1 | data.drop_duplicates() |
k1 | k2 | |
---|---|---|
0 | one | 1 |
1 | two | 1 |
2 | one | 2 |
3 | two | 2 |
4 | one | 3 |
5 | two | 3 |
columns : 指定部分列进行重复项判断/过滤
1 | data.duplicated(['k1']) |
1 | 0 False |
1 | data.drop_duplicates(['k2']) |
k1 | k2 | |
---|---|---|
0 | one | 1 |
2 | one | 2 |
4 | one | 3 |
keep : keep = 'first' / keep = 'last' , 保留项
1 | data.duplicated(['k1'], keep='last') |
1 | 0 True |
1 | data.drop_duplicates(['k2'], keep='last') |
k1 | k2 | |
---|---|---|
1 | two | 1 |
3 | two | 2 |
6 | two | 3 |
数据映射与替换
series.map( arg ) : 利用函数或字典进行数据映射
1 | data = pd.DataFrame({'food': ['Bacon', 'pulled pork', 'bacon', |
food | ounces | |
---|---|---|
0 | Bacon | 4.0 |
1 | pulled pork | 3.0 |
2 | bacon | 12.0 |
3 | Pastrami | 6.0 |
4 | corned beef | 7.5 |
5 | Bacon | 8.0 |
6 | pastrami | 3.0 |
7 | honey ham | 5.0 |
8 | nova lox | 6.0 |
映射数据
1 | meat_to_animal = {'bacon': 'pig', |
1 | {'bacon': 'pig', |
series.map( dict )
1 | data['animal1'] = data['food'].str.lower().map(meat_to_animal) |
food | ounces | animal1 | |
---|---|---|---|
0 | Bacon | 4.0 | pig |
1 | pulled pork | 3.0 | pig |
2 | bacon | 12.0 | pig |
3 | Pastrami | 6.0 | cow |
4 | corned beef | 7.5 | cow |
5 | Bacon | 8.0 | pig |
6 | pastrami | 3.0 | cow |
7 | honey ham | 5.0 | pig |
8 | nova lox | 6.0 | salmon |
series.map( func )
1 | data['animal2'] = data['food'].map( lambda x: meat_to_animal[x.lower()] ) |
food | ounces | animal1 | animal2 | |
---|---|---|---|---|
0 | Bacon | 4.0 | pig | pig |
1 | pulled pork | 3.0 | pig | pig |
2 | bacon | 12.0 | pig | pig |
3 | Pastrami | 6.0 | cow | cow |
4 | corned beef | 7.5 | cow | cow |
5 | Bacon | 8.0 | pig | pig |
6 | pastrami | 3.0 | cow | cow |
7 | honey ham | 5.0 | pig | pig |
8 | nova lox | 6.0 | salmon | salmon |
obj.replace( to_replace, value ) : 数据替换
1 | data = pd.Series([1., -999., 2., -999., -1000., 3.]) |
1 | 0 1.0 |
1 | data.replace([-999, -1000], [np.nan, 0]) |
1 | 0 1.0 |
1 | data.replace({-999: np.nan, -1000: 0}) |
1 | 0 1.0 |
obj.rename( index, columns, inplace ) : 索引重命名
1 | data = pd.DataFrame(np.arange(12).reshape((3, 4)), |
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
New York | 8 | 9 | 10 | 11 |
series.map( func ) : map 方法
1 | data.index = data.index.map( lambda x: x[:4].upper() ) |
one | two | three | four | |
---|---|---|---|---|
OHIO | 0 | 1 | 2 | 3 |
COLO | 4 | 5 | 6 | 7 |
NEW | 8 | 9 | 10 | 11 |
series.rename( index, columns ) : rename 方法
1 | data.rename(index=str.title, columns=str.upper) |
ONE | TWO | THREE | FOUR | |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colo | 4 | 5 | 6 | 7 |
New | 8 | 9 | 10 | 11 |
rename 可以结合字典型对象实现对部分轴标签的更新
1 | data.rename(index={'OHIO': 'INDIANA'}, columns={'three': 'peekaboo'}) |
one | two | peekaboo | four | |
---|---|---|---|---|
INDIANA | 0 | 1 | 2 | 3 |
COLO | 4 | 5 | 6 | 7 |
NEW | 8 | 9 | 10 | 11 |
数据划分
pd.cut( x, bins, right, labels ) : 划分面元( binning )
x : The input array to be binned; must be 1-D
bins : 面元, 可以是确切的面元边界, 也可以是面元数量
right : True : ( , ] ; False : [ , )
labels : 设置面元的名称
bins = list : 直接指定面元
1 | ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32] |
1 | cats.categories # 展示了划分的面元 |
1 | IntervalIndex([[18, 25), [25, 35), [35, 60), [60, 100)], dtype='interval[int64, left]') |
1 | cats.codes |
1 | array([0, 0, 1, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8) |
1 | pd.value_counts(cats) |
1 | [18, 25) 4 |
1 | group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior'] |
1 | ['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult'] |
bins = n : 根据样本的 最小值和最大值 计算等长的面元
1 | data = np.random.randint(0, 11, (50)) |
1 | [(-0.01, 2.0], (-0.01, 2.0], (-0.01, 2.0], (4.0, 6.0], (-0.01, 2.0], ..., (6.0, 8.0], (8.0, 10.0], (-0.01, 2.0], (4.0, 6.0], (-0.01, 2.0]] |
pd.qcut( x, q, labels ) : 根据分位数划分面元( quantile binning )
x : The input array to be binned; must be 1-D
q : Number of quantiles, 分位数
labels : 设置面元的名称
q = n : 根据样本的 分位数 对数据进行面元划分
1 | data = np.random.randint(0, 101, (1000)) |
1 | [(50.5, 76.0], (26.0, 50.5], (-0.001, 26.0], (50.5, 76.0], (26.0, 50.5], ..., (-0.001, 26.0], (26.0, 50.5], (76.0, 100.0], (26.0, 50.5], (26.0, 50.5]] |
1 | pd.value_counts(cats) |
1 | (50.5, 76.0] 260 |
q = list : 传递自定义的分位数(0到1之间的数值, 包含端点)
1 | pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]) |
1 | [(50.5, 91.0], (11.0, 50.5], (11.0, 50.5], (50.5, 91.0], (11.0, 50.5], ..., (11.0, 50.5], (11.0, 50.5], (50.5, 91.0], (11.0, 50.5], (11.0, 50.5]] |