Pandas - 数据读写

这篇随笔主要介绍利用 Pandas 对 Excel 文件进行读写的操作

1
2
import pandas as pd
import numpy as np

读文件

pd.read_csv( path, sep, header, index_col, names, skiprows, na_values, nrows, chunksize, encoding )

path : 文件位置
sep : 对每行各字段进行拆分的字符序列
header : 用作列名的行号, 默认为 0 , 没有标题行设为 None
names : 用作设置列名的 list
index_col : 用作索引的列标签, 可以是单个或多个列名组成的 list
skiprows : 需要忽略的行号的 list
na_values : 用作替换缺失值 NaN 的 list 或 dict
nrows : 需要读取的行数
chunksize : 返回分块读取的迭代器
encoding : 用作 unicode 的文本编码格式

1
pd.read_csv('pydata-book-2nd-edition/examples/ex1.csv')
abcdmessage
01234hello
15678world
29101112foo

header : 处理没有标题行的文件

1
pd.read_csv('pydata-book-2nd-edition/examples/ex2.csv')
1234hello
05678world
19101112foo

header = None

1
pd.read_csv('pydata-book-2nd-edition/examples/ex2.csv', header=None)
01234
01234hello
15678world
29101112foo

names = list

1
pd.read_csv('pydata-book-2nd-edition/examples/ex2.csv', names=['q', 'w', 'e', 'r', 'message'])
qwermessage
01234hello
15678world
29101112foo

index_col : 设置索引

1
2
3
pd.read_csv('pydata-book-2nd-edition/examples/ex2.csv', 
names=['q', 'w', 'e', 'r', 'message'],
index_col='message')
qwer
message
hello1234
world5678
foo9101112

层次化索引

1
pd.read_csv('pydata-book-2nd-edition/examples/csv_mindex.csv')
key1key2value1value2
0onea12
1oneb34
2onec56
3oned78
4twoa910
5twob1112
6twoc1314
7twod1516
1
pd.read_csv('pydata-book-2nd-edition/examples/csv_mindex.csv', index_col=['key1', 'key2'])
value1value2
key1key2
onea12
b34
c56
d78
twoa910
b1112
c1314
d1516

skiprows : 跳行

1
list(open('pydata-book-2nd-edition/examples/ex4.csv'))
1
2
3
4
5
6
7
['# hey!\n',
'a,b,c,d,message\n',
'# just wanted to make things more difficult for you\n',
'# who reads CSV files with computers, anyway?\n',
'1,2,3,4,hello\n',
'5,6,7,8,world\n',
'9,10,11,12,foo']
1
2
pd.read_csv('pydata-book-2nd-edition/examples/ex4.csv', skiprows=[0, 2, 3])
# 跳过第1、3、4行
abcdmessage
01234hello
15678world
29101112foo

na_values : 处理缺失值

1
list(open('pydata-book-2nd-edition/examples/ex5.csv'))
1
2
3
4
['something,a,b,c,d,message\n',
'one,1,2,3,4,NA\n',
'two,5,6,,8,world\n',
'three,9,10,11,12,foo']
1
pd.read_csv('pydata-book-2nd-edition/examples/ex5.csv')
somethingabcdmessage
0one123.04NaN
1two56NaN8world
2three91011.012foo

na_value = dic/list

1
pd.read_csv('pydata-book-2nd-edition/examples/ex5.csv', na_values=['one', 'two', 'three'])
somethingabcdmessage
0NaN123.04NaN
1NaN56NaN8world
2NaN91011.012foo
1
2
3
# 字典的各列可以使用不同的 NaN 标记值
sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
pd.read_csv('pydata-book-2nd-edition/examples/ex5.csv', na_values=sentinels)
somethingabcdmessage
0one123.04NaN
1NaN56NaN8world
2three91011.012NaN

pd.read_table( path, sep, ... )

sep : 分隔符

1
pd.read_table('pydata-book-2nd-edition/examples/ex1.csv', sep=',')
abcdmessage
01234hello
15678world
29101112foo
1
list(open('pydata-book-2nd-edition/examples/ex3.txt'))
1
2
3
4
5
['            A         B         C\n',
'aaa -0.264438 -1.026059 -0.619500\n',
'bbb 0.927272 0.302904 -0.032399\n',
'ccc -0.264273 -0.386314 -0.217601\n',
'ddd -0.871858 -0.348382 1.100491\n']
1
2
pd.read_table('pydata-book-2nd-edition/examples/ex3.txt', sep='\s+')
# 由于列名比数据行的数量少, 所以 read_table 推断第一列应该是 DataFrame 的索引
ABC
aaa-0.264438-1.026059-0.619500
bbb0.9272720.302904-0.032399
ccc-0.264273-0.386314-0.217601
ddd-0.871858-0.3483821.100491

逐块读取文件

pd.read_csv( path, nrows )

1
pd.read_csv('pydata-book-2nd-edition/examples/ex6.csv', nrows=5)
onetwothreefourkey
00.467976-0.038649-0.295344-1.824726L
1-0.3588931.4044530.704965-0.200638B
2-0.5018400.659254-0.421691-0.057688G
30.2048861.0741341.388361-0.982404R
40.354628-0.1331160.283763-0.837063Q

pd.read_csv( path, chunksize ) : 迭代器

1
2
3
4
5
6
7
8
9
10
11
chunker = pd.read_csv('pydata-book-2nd-edition/examples/ex6.csv', chunksize=1000)
# 设置分块大小为1000行
tot = pd.Series([])
for piece in chunker:
# 1000行分块
tot = tot.add(piece['key'].value_counts(), fill_value=0)
# piece['key'].value_counts() : 返回一个 series
# 计算 key 中各值出现的次数

tot = tot.sort_values(ascending=False)
tot[:10]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
<ipython-input-19-c6c0008a71bc>:3: DeprecationWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
tot = pd.Series([])

E 368.0
X 364.0
L 346.0
O 343.0
Q 340.0
M 338.0
J 337.0
F 335.0
K 334.0
H 330.0
dtype: float64

写文件

frames.to_csv( path, sep, na_rep, index, header, columns )

1
2
data = pd.read_csv('pydata-book-2nd-edition/examples/ex5.csv')
data
somethingabcdmessage
0one123.04NaN
1two56NaN8world
2three91011.012foo
1
data.to_csv('pydata-book-2nd-edition/examples/out.csv')

sep : 分隔符号

1
2
data.to_csv(sys.stdout, sep='|' )
# 直接写出到 sys.stdout , 所以仅仅是打印出文本结果而已
1
2
3
4
|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo

na_rep : 缺失值

1
data.to_csv(sys.stdout, na_rep='-9999')
1
2
3
4
,something,a,b,c,d,message
0,one,1,2,3.0,4,-9999
1,two,5,6,-9999,8,world
2,three,9,10,11.0,12,foo

index, header : 索引, 列标题的控制

1
data.to_csv(sys.stdout, index=False, header=False)
1
2
3
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo

columns : 输出指定的行或列

1
data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])
1
2
3
4
a,b,c
1,2,3.0
5,6,
9,10,11.0

series.to_csv( path )

1
2
3
dates = pd.date_range('1/1/2000', periods=7)
obj = pd.Series(np.arange(7), index=dates)
obj.to_csv(sys.stdout)
1
2
3
4
5
6
7
8
,0
2000-01-01,0
2000-01-02,1
2000-01-03,2
2000-01-04,3
2000-01-05,4
2000-01-06,5
2000-01-07,6

读写 Excel 文件

pd.ExcelFile( path ) + pd.read_excel( ExcelFile, sheet_name )

读取一个文件的多个 sheet 时, 先创建 ExcelFile 传递路径, 然后使用 read_excel 读取, 速度更快

1
2
xlsx = pd.ExcelFile('pydata-book-2nd-edition/examples/ex1.xlsx')
pd.read_excel(xlsx, 'Sheet1')
Unnamed: 0abcdmessage
001234hello
115678world
229101112foo

pd.read_excel( path, sheet_name )

直接传递文件路径到 pd.read_excel 中

1
pd.read_excel('pydata-book-2nd-edition/examples/ex1.xlsx', 'Sheet1')
Unnamed: 0abcdmessage
001234hello
115678world
229101112foo

pd.ExcelWriter( path ) + frame.to_excel( ExcelWriter, sheet_name ) + ExcelWriter.save( )

首先创建 ExcelWriter , 然后使用 to_excel 方法将数据写入到其中

1
2
3
4
frame = pd.read_excel('pydata-book-2nd-edition/examples/ex1.xlsx', 'Sheet1')
writer = pd.ExcelWriter('pydata-book-2nd-edition/examples/ex2.xlsx')
frame.to_excel(writer, 'Sheet1')
writer.save()

frame.to_excel( path, sheet_name )

直接传递文件路径到 to_excel 中

1
frame.to_excel('pydata-book-2nd-edition/examples/ex2.xlsx', 'Sheet1')