这篇随笔主要介绍利用 Pandas 对 Excel 文件进行读写的操作
1 2 import pandas as pdimport numpy as np
读文件 path : 文件位置sep : 对每行各字段进行拆分的字符序列header : 用作列名的行号, 默认为 0 , 没有标题行设为 Nonenames : 用作设置列名的 listindex_col : 用作索引的列标签, 可以是单个或多个列名组成的 listskiprows : 需要忽略的行号的 listna_values : 用作替换缺失值 NaN 的 list 或 dictnrows : 需要读取的行数chunksize : 返回分块读取的迭代器encoding : 用作 unicode 的文本编码格式
1 pd.read_csv('pydata-book-2nd-edition/examples/ex1.csv' )
a b c d message 0 1 2 3 4 hello 1 5 6 7 8 world 2 9 10 11 12 foo
1 pd.read_csv('pydata-book-2nd-edition/examples/ex2.csv' )
1 2 3 4 hello 0 5 6 7 8 world 1 9 10 11 12 foo
header = None
1 pd.read_csv('pydata-book-2nd-edition/examples/ex2.csv' , header=None )
0 1 2 3 4 0 1 2 3 4 hello 1 5 6 7 8 world 2 9 10 11 12 foo
names = list
1 pd.read_csv('pydata-book-2nd-edition/examples/ex2.csv' , names=['q' , 'w' , 'e' , 'r' , 'message' ])
q w e r message 0 1 2 3 4 hello 1 5 6 7 8 world 2 9 10 11 12 foo
index_col : 设置索引1 2 3 pd.read_csv('pydata-book-2nd-edition/examples/ex2.csv' , names=['q' , 'w' , 'e' , 'r' , 'message' ], index_col='message' )
q w e r message hello 1 2 3 4 world 5 6 7 8 foo 9 10 11 12
层次化索引
1 pd.read_csv('pydata-book-2nd-edition/examples/csv_mindex.csv' )
key1 key2 value1 value2 0 one a 1 2 1 one b 3 4 2 one c 5 6 3 one d 7 8 4 two a 9 10 5 two b 11 12 6 two c 13 14 7 two d 15 16
1 pd.read_csv('pydata-book-2nd-edition/examples/csv_mindex.csv' , index_col=['key1' , 'key2' ])
value1 value2 key1 key2 one a 1 2 b 3 4 c 5 6 d 7 8 two a 9 10 b 11 12 c 13 14 d 15 16
skiprows : 跳行1 list (open ('pydata-book-2nd-edition/examples/ex4.csv' ))
1 2 3 4 5 6 7 ['# hey!\n', 'a,b,c,d,message\n', '# just wanted to make things more difficult for you\n', '# who reads CSV files with computers, anyway?\n', '1,2,3,4,hello\n', '5,6,7,8,world\n', '9,10,11,12,foo']
1 2 pd.read_csv('pydata-book-2nd-edition/examples/ex4.csv' , skiprows=[0 , 2 , 3 ])
a b c d message 0 1 2 3 4 hello 1 5 6 7 8 world 2 9 10 11 12 foo
na_values : 处理缺失值1 list (open ('pydata-book-2nd-edition/examples/ex5.csv' ))
1 2 3 4 ['something,a,b,c,d,message\n', 'one,1,2,3,4,NA\n', 'two,5,6,,8,world\n', 'three,9,10,11,12,foo']
1 pd.read_csv('pydata-book-2nd-edition/examples/ex5.csv' )
something a b c d message 0 one 1 2 3.0 4 NaN 1 two 5 6 NaN 8 world 2 three 9 10 11.0 12 foo
na_value = dic/list
1 pd.read_csv('pydata-book-2nd-edition/examples/ex5.csv' , na_values=['one' , 'two' , 'three' ])
something a b c d message 0 NaN 1 2 3.0 4 NaN 1 NaN 5 6 NaN 8 world 2 NaN 9 10 11.0 12 foo
1 2 3 sentinels = {'message' : ['foo' , 'NA' ], 'something' : ['two' ]} pd.read_csv('pydata-book-2nd-edition/examples/ex5.csv' , na_values=sentinels)
something a b c d message 0 one 1 2 3.0 4 NaN 1 NaN 5 6 NaN 8 world 2 three 9 10 11.0 12 NaN
pd.read_table( path, sep, ... ) sep : 分隔符1 pd.read_table('pydata-book-2nd-edition/examples/ex1.csv' , sep=',' )
a b c d message 0 1 2 3 4 hello 1 5 6 7 8 world 2 9 10 11 12 foo
1 list (open ('pydata-book-2nd-edition/examples/ex3.txt' ))
1 2 3 4 5 [' A B C\n', 'aaa -0.264438 -1.026059 -0.619500\n', 'bbb 0.927272 0.302904 -0.032399\n', 'ccc -0.264273 -0.386314 -0.217601\n', 'ddd -0.871858 -0.348382 1.100491\n']
1 2 pd.read_table('pydata-book-2nd-edition/examples/ex3.txt' , sep='\s+' )
A B C aaa -0.264438 -1.026059 -0.619500 bbb 0.927272 0.302904 -0.032399 ccc -0.264273 -0.386314 -0.217601 ddd -0.871858 -0.348382 1.100491
逐块读取文件 pd.read_csv( path, nrows ) 1 pd.read_csv('pydata-book-2nd-edition/examples/ex6.csv' , nrows=5 )
one two three four key 0 0.467976 -0.038649 -0.295344 -1.824726 L 1 -0.358893 1.404453 0.704965 -0.200638 B 2 -0.501840 0.659254 -0.421691 -0.057688 G 3 0.204886 1.074134 1.388361 -0.982404 R 4 0.354628 -0.133116 0.283763 -0.837063 Q
pd.read_csv( path, chunksize ) : 迭代器1 2 3 4 5 6 7 8 9 10 11 chunker = pd.read_csv('pydata-book-2nd-edition/examples/ex6.csv' , chunksize=1000 ) tot = pd.Series([]) for piece in chunker: tot = tot.add(piece['key' ].value_counts(), fill_value=0 ) tot = tot.sort_values(ascending=False ) tot[:10 ]
1 2 3 4 5 6 7 8 9 10 11 12 13 14 <ipython-input-19-c6c0008a71bc>:3: DeprecationWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning. tot = pd.Series([]) E 368.0 X 364.0 L 346.0 O 343.0 Q 340.0 M 338.0 J 337.0 F 335.0 K 334.0 H 330.0 dtype: float64
写文件 frames.to_csv( path, sep, na_rep, index, header, columns ) 1 2 data = pd.read_csv('pydata-book-2nd-edition/examples/ex5.csv' ) data
something a b c d message 0 one 1 2 3.0 4 NaN 1 two 5 6 NaN 8 world 2 three 9 10 11.0 12 foo
1 data.to_csv('pydata-book-2nd-edition/examples/out.csv' )
sep : 分隔符号1 2 data.to_csv(sys.stdout, sep='|' )
1 2 3 4 |something|a|b|c|d|message 0|one|1|2|3.0|4| 1|two|5|6||8|world 2|three|9|10|11.0|12|foo
na_rep : 缺失值1 data.to_csv(sys.stdout, na_rep='-9999' )
1 2 3 4 ,something,a,b,c,d,message 0,one,1,2,3.0,4,-9999 1,two,5,6,-9999,8,world 2,three,9,10,11.0,12,foo
1 data.to_csv(sys.stdout, index=False , header=False )
1 2 3 one,1,2,3.0,4, two,5,6,,8,world three,9,10,11.0,12,foo
columns : 输出指定的行或列1 data.to_csv(sys.stdout, index=False , columns=['a' , 'b' , 'c' ])
1 2 3 4 a,b,c 1,2,3.0 5,6, 9,10,11.0
series.to_csv( path ) 1 2 3 dates = pd.date_range('1/1/2000' , periods=7 ) obj = pd.Series(np.arange(7 ), index=dates) obj.to_csv(sys.stdout)
1 2 3 4 5 6 7 8 ,0 2000-01-01,0 2000-01-02,1 2000-01-03,2 2000-01-04,3 2000-01-05,4 2000-01-06,5 2000-01-07,6
读写 Excel 文件 pd.ExcelFile( path ) + pd.read_excel( ExcelFile, sheet_name ) 读取一个文件的多个 sheet 时, 先创建 ExcelFile 传递路径, 然后使用 read_excel 读取, 速度更快
1 2 xlsx = pd.ExcelFile('pydata-book-2nd-edition/examples/ex1.xlsx' ) pd.read_excel(xlsx, 'Sheet1' )
Unnamed: 0 a b c d message 0 0 1 2 3 4 hello 1 1 5 6 7 8 world 2 2 9 10 11 12 foo
pd.read_excel( path, sheet_name ) 直接传递文件路径到 pd.read_excel 中
1 pd.read_excel('pydata-book-2nd-edition/examples/ex1.xlsx' , 'Sheet1' )
Unnamed: 0 a b c d message 0 0 1 2 3 4 hello 1 1 5 6 7 8 world 2 2 9 10 11 12 foo
pd.ExcelWriter( path ) + frame.to_excel( ExcelWriter, sheet_name ) + ExcelWriter.save( ) 首先创建 ExcelWriter , 然后使用 to_excel 方法将数据写入到其中
1 2 3 4 frame = pd.read_excel('pydata-book-2nd-edition/examples/ex1.xlsx' , 'Sheet1' ) writer = pd.ExcelWriter('pydata-book-2nd-edition/examples/ex2.xlsx' ) frame.to_excel(writer, 'Sheet1' ) writer.save()
frame.to_excel( path, sheet_name ) 直接传递文件路径到 to_excel 中
1 frame.to_excel('pydata-book-2nd-edition/examples/ex2.xlsx' , 'Sheet1' )