csv文件读取

In [1]:
import pandas as pd
import sys
In [2]:
%cat examples/ex2.csv
 
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
In [3]:
#文件没有标签数据
pd.read_csv('examples/ex2.csv',header=None)
Out[3]:
 
01234hello
15678world
29101112foo
In [4]:
pd.read_csv('examples/ex2.csv',names=['a','b','c','d','massage'])
Out[4]:
 
01234hello
15678world
29101112foo
In [5]:
#将其中的一列设为索引列
pd.read_csv('examples/ex2.csv',names=['a','b','c','d','massage'],index_col='massage')
Out[5]:
 
hello1234
world5678
foo9101112
In [6]:
list(open('examples/ex3.txt'))
Out[6]:
['            A         B         C\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382  1.100491\n']
In [7]:
#多出的一列数据自动识别为索引,分隔符不同使用正则表达式
pd.read_csv('examples/ex3.csv',sep='\s+')
Out[7]:
 
aaa-0.264438-1.026059-0.619500
bbb0.9272720.302904-0.032399
ccc-0.264273-0.386314-0.217601
ddd-0.871858-0.3483821.100491
 

将读取的非空数据设为NaN

In [8]:
%cat examples/ex5.csv
 
something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo
In [9]:
pd.read_csv('examples/ex5.csv',na_values={'something':'two','massage':['NA','foo']})
Out[9]:
 
0one123.04NaN
1NaN56NaN8world
2three91011.012foo
In [10]:
#只读取一部分数据
pd.read_csv('examples/ex6.csv',nrows=10)
Out[10]:
 
00.467976-0.038649-0.295344-1.824726L
1-0.3588931.4044530.704965-0.200638B
2-0.5018400.659254-0.421691-0.057688G
30.2048861.0741341.388361-0.982404R
40.354628-0.1331160.283763-0.837063Q
51.8174800.7422730.419395-2.251035Q
6-0.7767640.935518-0.332872-1.875641U
7-0.9131351.530624-0.5726570.477252K
80.358480-0.497572-0.3670160.507702S
9-1.740877-1.160417-1.6378302.172201G
In [11]:
#将数据分块读取
chunker = pd.read_csv('examples/ex6.csv',chunksize=1000)
for piece in chunker:
    print(piece.iloc[0])
 
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              L
Name: 0, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              T
Name: 1000, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              1
Name: 2000, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              H
Name: 3000, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              H
Name: 4000, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              1
Name: 5000, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              I
Name: 6000, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              1
Name: 7000, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              7
Name: 8000, dtype: object
one       0.467976
two     -0.0386485
three    -0.295344
four      -1.82473
key              B
Name: 9000, dtype: object
 

写入csv文件

In [12]:
data = pd.read_csv('examples/ex5.csv')
data.to_csv(sys.stdout)
 
,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo
In [13]:
data.to_csv(sys.stdout,sep='|')
 
|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo
In [14]:
#对缺失值进行标识
data.to_csv(sys.stdout,na_rep='NULL')
 
,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo
In [15]:
data.to_csv(sys.stdout,index=False,header=False)
 
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo
In [16]:
#只写入子集
data.to_csv(sys.stdout,index=False,columns=['a','b','c'])
 
a,b,c
1,2,3.0
5,6,
9,10,11.0
 

json文件

In [17]:
%cat examples/example.json
 
[{"a": 1, "b": 2, "c": 3},
 {"a": 4, "b": 5, "c": 6},
 {"a": 7, "b": 8, "c": 9}]
In [18]:
data = pd.read_json('examples/example.json')
data
Out[18]:
 
0123
1456
2789
In [19]:
data.to_json(sys.stdout)
 
{"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}
In [20]:
#按行写入
data.to_json(sys.stdout,orient='records')
 
[{"a":1,"b":2,"c":3},{"a":4,"b":5,"c":6},{"a":7,"b":8,"c":9}]
 

HTML

In [21]:
#搜索并解析包含在table标签中的数据
tables = pd.read_html('examples/fdic_failed_bank_list.html')
#只有一张表格
len(tables)
Out[21]:
1
In [22]:
data = tables[0]
data.head()
Out[22]:
 
0Allied BankMulberryAR91Today's BankSeptember 23, 2016November 17, 2016
1The Woodbury Banking CompanyWoodburyGA11297United BankAugust 19, 2016November 17, 2016
2First CornerStone BankKing of PrussiaPA35312First-Citizens Bank & Trust CompanyMay 6, 2016September 6, 2016
3Trust Company BankMemphisTN9956The Bank of Fayette CountyApril 29, 2016September 6, 2016
4North Milwaukee State BankMilwaukeeWI20364First-Citizens Bank & Trust CompanyMarch 11, 2016June 16, 2016
 

excel

In [23]:
data = pd.read_excel('examples/ex1.xlsx','Sheet1')
data
Out[23]:
 
001234hello
115678world
229101112foo
In [24]:
data.to_excel('examples/ex2.xlsx')
 

Web API

In [25]:
import requests
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'
resp = requests.get(url)
resp
Out[25]:
<Response [200]>
In [26]:
data = resp.json()#data为字典数组
issues = pd.DataFrame(data,columns=['title','url','state','labels'])#提取部分字段
issues.head()
Out[26]:
 
0Resample on MultiIndex level is much longer th...https://api.github.com/repos/pandas-dev/pandas...open[]
1BUG: value_counts can handle the case even wit...https://api.github.com/repos/pandas-dev/pandas...open[]
2Build error on Arch linuxhttps://api.github.com/repos/pandas-dev/pandas...open[]
3BUG: DataFrame.to_html validates formatters ha...https://api.github.com/repos/pandas-dev/pandas...open[]
4BUG DataFream.rindex方法参数的错误https://api.github.com/repos/pandas-dev/pandas...open[{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...
01-25 08:53