基本使用
前情提要:
pandas 是我们数据分析的核心包
pandas 的 DataFrame 相当于一张表数据
Series 相当于表数据的一列
: Part a 关于基础:
1 #结构化的数据分析
2
3 pip3 install jupyter
4 pip3 install pandas
5
6
7 #import pandas as pd
8 #import numpy as np
9
10 #事例一:
11
12 #s = pd.Series([1,3,5,np.NaN,8,4])
13 #s
14
15 Out[5]:
16 0 1.0
17 1 3.0
18 2 5.0
19 3 NaN
20 4 8.0
21 5 4.0
22 dtype: float64
23
24 #事例二:
25
26 #创建日期序列,里面有6个日期
27 #datas = pd.date_range('20170821',periods=6)
28 #datas
29
30 DatetimeIndex([‘2017-08-21‘, ‘2017-08-22‘, ‘2017-08-23‘, ‘2017-08-24‘,‘2017-08-25‘, ‘2017-08-26‘], dtype=‘datetime64[ns]‘, freq=‘D‘)
31
32
33 #创建二维数组
34
35 #创建6行4列的随机数,索引datas,列ABCD
36
37 #data = pd.DataFrame(np.random.randn(6,4),index=datas,columns=list('ABCD'))
38 #data
39
40 A B C D
41 2017-08-21 -0.245344 0.260401 -2.003621 0.427142
42 2017-08-22 -2.773848 1.604729 -0.711769 -0.677211
43 2017-08-23 0.396086 -0.731911 -0.100242 0.966344
44 2017-08-24 0.761821 -0.159621 -1.172729 -1.317056
45 2017-08-25 0.186582 0.739702 -1.688458 0.480121
46 2017-08-26 -0.519489 -0.002741 0.875164 -0.783657
47
48 #data.values
49
50 array([[ -2.45343824e-01, 2.60401419e-01, -2.00362100e+00, 4.27142120e-01],
51 [ -2.77384841e+00, 1.60472878e+00, -7.11768546e-01, -6.77211441e-01],
52 [ 3.96086166e-01, -7.31910686e-01, -1.00241967e-01, 9.66344486e-01],
53 [ 7.61820910e-01, -1.59621471e-01, -1.17272904e+00, -1.31705593e+00],
54 [ 1.86582291e-01, 7.39702155e-01, -1.68845777e+00, 4.80121151e-01],
55 [ -5.19488872e-01, -2.74128435e-03, 8.75164076e-01, -7.83656946e-01]])
56
57
58 #data.head(2) #看前两行的数据
59 A B C D
60 2017-08-21 -0.245344 0.260401 -2.003621 0.427142
61 2017-08-22 -2.773848 1.604729 -0.711769 -0.677211
62
63
64 #data.T #行列转置
65
66 2017-08-21 2017-08-22 2017-08-23 2017-08-24 2017-08-25 2017-08-26
67 A -0.245344 -2.773848 0.396086 0.761821 0.186582 -0.519489
68 B 0.260401 1.604729 -0.731911 -0.159621 0.739702 -0.002741
69 C -2.003621 -0.711769 -0.100242 -1.172729 -1.688458 0.875164
70 D 0.427142 -0.677211 0.966344 -1.317056 0.480121 -0.783657
71
72
73 #data.sort_index(axis=1) #列正序
74
75 A B C D
76 2017-08-21 -0.245344 0.260401 -2.003621 0.427142
77 2017-08-22 -2.773848 1.604729 -0.711769 -0.677211
78 2017-08-23 0.396086 -0.731911 -0.100242 0.966344
79 2017-08-24 0.761821 -0.159621 -1.172729 -1.317056
80 2017-08-25 0.186582 0.739702 -1.688458 0.480121
81 2017-08-26 -0.519489 -0.002741 0.875164 -0.783657
82
83
84 #data.sort_index(axis=1,ascending=False) #列倒序
85
86 D C B A
87 2017-08-21 0.427142 -2.003621 0.260401 -0.245344
88 2017-08-22 -0.677211 -0.711769 1.604729 -2.773848
89 2017-08-23 0.966344 -0.100242 -0.731911 0.396086
90 2017-08-24 -1.317056 -1.172729 -0.159621 0.761821
91 2017-08-25 0.480121 -1.688458 0.739702 0.186582
92 2017-08-26 -0.783657 0.875164 -0.002741 -0.519489
93
94
95 #data.sort_index(axis=0,ascending=False) #行倒序
96
97 A B C D
98 2017-08-26 -0.519489 -0.002741 0.875164 -0.783657
99 2017-08-25 0.186582 0.739702 -1.688458 0.480121
100 2017-08-24 0.761821 -0.159621 -1.172729 -1.317056
101 2017-08-23 0.396086 -0.731911 -0.100242 0.966344
102 2017-08-22 -2.773848 1.604729 -0.711769 -0.677211
103 2017-08-21 -0.245344 0.260401 -2.003621 0.427142
104
105
106 #data.sort_values(by=‘A‘) # A列排序
107
108 A B C D
109 2017-08-22 -2.773848 1.604729 -0.711769 -0.677211
110 2017-08-26 -0.519489 -0.002741 0.875164 -0.783657
111 2017-08-21 -0.245344 0.260401 -2.003621 0.427142
112 2017-08-25 0.186582 0.739702 -1.688458 0.480121
113 2017-08-23 0.396086 -0.731911 -0.100242 0.966344
114 2017-08-24 0.761821 -0.159621 -1.172729 -1.317056
115
116
117 #data[‘A‘] 或 data.A #数据选择
118
119 2017-08-21 -0.245344
120 2017-08-22 -2.773848
121 2017-08-23 0.396086
122 2017-08-24 0.761821
123 2017-08-25 0.186582
124 2017-08-26 -0.519489
125
126
127 #data.loc['2017-08-21':'2017-08-23'] #按标签选择哪几行(包含结束标签)
128
129 A B C D
130 2017-08-21 -0.245344 0.260401 -2.003621 0.427142
131 2017-08-22 -2.773848 1.604729 -0.711769 -0.677211
132 2017-08-23 0.396086 -0.731911 -0.100242 0.966344
133
134
135 #data.iloc[2:4] #行
136
137 A B C D
138 2017-08-23 0.396086 -0.731911 -0.100242 0.966344
139 2017-08-24 0.761821 -0.159621 -1.172729 -1.317056
140
141
142 #data.loc[:,[‘B‘,‘C‘]] #取出列
143
144 B C
145 2017-08-21 0.260401 -2.003621
146 2017-08-22 1.604729 -0.711769
147 2017-08-23 -0.731911 -0.100242
148 2017-08-24 -0.159621 -1.172729
149 2017-08-25 0.739702 -1.688458
150 2017-08-26 -0.002741 0.875164
151
152
153 #data.loc[‘2017-08-21‘:‘2017-08-23‘,[‘B‘,‘C‘]] #指定行和列
154
155 B C
156 2017-08-21 0.260401 -2.003621
157 2017-08-22 1.604729 -0.711769
158 2017-08-23 -0.731911 -0.100242
159
160
161 #data.loc[‘2017-08-21‘,‘B‘] #访问特定的值
162
163 0.26040141861580018
164
165 #data.at[pd.Timestamp(‘2017-08-21‘),‘B‘] #比上面效率高
166
167 #0.26040141861580018
168
169
170 #data.A = range(6) #修改列数据
171 #data
172 A B C D
173 2017-08-21 0 0.260401 -2.003621 0.427142
174 2017-08-22 1 1.604729 -0.711769 -0.677211
175 2017-08-23 2 -0.731911 -0.100242 0.966344
176 2017-08-24 3 -0.159621 -1.172729 -1.317056
177 2017-08-25 4 0.739702 -1.688458 0.480121
178 2017-08-26 5 -0.002741 0.875164 -0.783657
179
180
181 #data.iloc[:,2:5] = 1000 #修改某几列
182 #data
183
184 A B C D
185 2017-08-21 0 0.260401 1000 1000
186 2017-08-22 1 1.604729 1000 1000
187 2017-08-23 2 -0.731911 1000 1000
188 2017-08-24 3 -0.159621 1000 1000
189 2017-08-25 4 0.739702 1000 1000
190 2017-08-26 5 -0.002741 1000 1000
191
192
193 #事例三:
194
195 #d = {'A':1,'B':pd.Timestamp("20160821"),'C':list(range(4)),'D':np.arange(4)}
196 #d
197
198 {‘A‘: 1,
199 ‘B‘: Timestamp(‘2016-08-21 00:00:00‘),
200 ‘C‘: [0, 1, 2, 3],
201 ‘D‘: array([0, 1, 2, 3])}
202
203 #df = pd.DataFrame(d)
204 #df
205
206 A B C D
207 0 1 2016-08-21 0 0
208 1 1 2016-08-21 1 1
209 2 1 2016-08-21 2 2
210 3 1 2016-08-21 3 3
211
212 #df.dtypes
213 A int64
214 B datetime64[ns]
215 C int64
216 D int64
: Part b 关于DataFrame :
#创建方式 #pd.DataFrame({‘one‘:[1,2,3,4],‘two‘:[4,3,2,1]}) one two 0 1 4 1 2 3 2 3 2 3 4 1 #pd.DataFrame({‘one‘:[1,2,3],‘two‘:[3,2,1]},index=list(‘abc‘)) one two a 1 3 b 2 2 c 3 1 #pd.DataFrame({‘one‘: pd.Series([1,2,3],index=[‘a‘,‘b‘,‘c‘]),‘two‘ :pd.Series([1,2,3,4],index=[‘b‘,‘a‘,‘c‘,‘d‘])}) one two a 1.0 2 b 2.0 1 c 3.0 3 d NaN 4
: Part c 关于碎知识点
1 #obj = pd.read_csv('601318.csv') #从文件中导入 2 #obj.to_csv('new_obj.csv') #导出 3 4 #obj.index #获取行索引 5 #obj.columns #获取列索引 6 #obj.values #返回所有的值 7 #obj.rename(columns={'close':'new_close'}) #给列改名称(返回新对象,原obj不变) 8 9 # loc 按标签选取 10 11 #obj.loc[:,['close','open']] #所有行的两列 12 #obj.loc[0:10,['close','open']] #标签0到10的行的两列(loc切片包含结束标签,默认整数索引下共11行) 0可以省略 13 14 # iloc 只能按位置(下标)选取 15 16 #obj.iloc[0:10,0:3] #前10行的前3列(iloc切片不包含结束位置) 17 #obj[obj['open']<20] #open小于20的
:Part d 关于pandas,numpy,pyplot综合使用
import pandas as pd import numpy as np import matplotlib.pyplot as plt #基本显示 #dates = pd.date_range('20160821',periods=6) #df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD')) #df A B C D 2016-08-21 0.395062 -0.713272 0.560768 1.175623 2016-08-22 0.487270 0.541010 -1.451927 -0.374084 2016-08-23 1.190535 -0.017564 1.065334 -0.716946 2016-08-24 -0.231604 0.890002 -2.085083 -0.421781 2016-08-25 -0.282863 0.039613 0.900477 -1.048180 2016-08-26 0.238394 -0.086092 0.002821 1.893919 # 取前4行并增加一列 E #df1 = df.reindex(index=dates[0:4],columns=list(df.columns) + ['E']) #df1 A B C D E 2016-08-21 0.395062 -0.713272 0.560768 1.175623 NaN 2016-08-22 0.487270 0.541010 -1.451927 -0.374084 NaN 2016-08-23 1.190535 -0.017564 1.065334 -0.716946 NaN 2016-08-24 -0.231604 0.890002 -2.085083 -0.421781 NaN # 修改列值 #df1.loc[dates[1:3],'E'] = 2 #df1 A B C D E 2016-08-21 0.395062 -0.713272 0.560768 1.175623 NaN 2016-08-22 0.487270 0.541010 -1.451927 -0.374084 2.0 2016-08-23 1.190535 -0.017564 1.065334 -0.716946 2.0 2016-08-24 -0.231604 0.890002 -2.085083 -0.421781 NaN #有空数据行的丢掉 #df1.dropna() A B C D E 2016-08-22 0.487270 0.541010 -1.451927 -0.374084 2.0 2016-08-23 1.190535 -0.017564 1.065334 -0.716946 2.0 # 有空的值用默认值替换 #df1.fillna(value=5) A B C D E 2016-08-21 0.395062 -0.713272 0.560768 1.175623 5.0 2016-08-22 0.487270 0.541010 -1.451927 -0.374084 2.0 2016-08-23 1.190535 -0.017564 1.065334 -0.716946 2.0 2016-08-24 -0.231604 0.890002 -2.085083 -0.421781 5.0 # 判断是否有空数据 #pd.isnull(df1) A B C D E 2016-08-21 False False False False True 2016-08-22 False False False False False 2016-08-23 False False False False False 2016-08-24 False False False False True # 求每列的平均值(默认axis=0),空数据不参与计算 #df1.mean() A 0.460316 B 0.175044 C -0.477727 D -0.084297 E 2.000000 dtype: float64 # 求每行的平均值(axis=1),空数据不参与计算 #df1.mean(axis=1) (注意:下面贴出的输出与上面按列求均值的结果相同,应为误贴;axis=1 的结果以日期为索引、每行一个均值,按上表数据约为 0.3545、0.2405、0.7043、-0.4621) A 0.460316 B 0.175044 C -0.477727 D -0.084297 E 2.000000 dtype: float64 # 累加 (把列传给参数处理) #df.apply(np.cumsum) A B C D 2016-08-21 -2.251855 0.967517 0.045508 1.011237 2016-08-22 -0.159759 -1.011472 0.642207 
1.451304 2016-08-23 0.595429 -1.164885 -1.323172 2.214757 2016-08-24 0.582950 0.299636 1.421000 2.018076 2016-08-25 0.624217 0.838486 1.840071 1.241466 2016-08-26 0.102015 -0.251384 0.422820 0.166844 #每列的最大值 减去最小值 #df.apply(lambda x: x.max() - x.min()) A 4.343951 B 3.443510 C 4.709552 D 2.085859 dtype: float64 #追加一行 #df.append(s,ignore_index=True) ####-------------- concat()--------------- #df = pd.DataFrame(np.random.randn(10,4),columns=list(‘ABCD‘)) #df A B C D 0 0.448883 0.667777 1.974642 -0.598013 1 -0.384818 -1.616549 1.551960 0.201467 2 -1.428748 0.122964 0.837514 1.102314 3 0.634624 0.553064 -0.248767 0.019210 4 2.518095 0.240933 -0.722795 -0.815324 5 0.517742 -2.444094 1.270380 -0.160528 6 -0.800297 -1.242557 -1.118255 0.616456 7 0.953936 0.318387 0.151009 0.340203 8 0.558308 -1.344539 0.846960 1.264978 9 1.549733 1.496383 0.208102 1.265871 #df.iloc[:3] # 取前3行 #df.iloc[3:7] # 3-7行 #df.iloc[7:] # 7-最后一行 #df1 = pd.concat([df.iloc[:3],df.iloc[3:7],df.iloc[7:]]) #等于上面3行 #df1 A B C D 0 -0.151648 -0.077165 1.130773 1.587466 1 -0.854935 0.490470 -0.438102 -0.066202 2 -0.622838 -1.524100 -0.995986 -1.378272 3 -0.718499 0.184935 -1.164265 1.127623 4 2.180069 0.799651 1.630780 -1.592882 5 -0.590875 -0.824192 -0.109713 -1.502221 6 0.437851 2.268239 0.305557 -0.515982 7 -1.084059 -0.325458 1.279105 -0.404447 8 0.134603 0.012700 0.935611 1.017555 9 0.521873 0.630876 -1.714364 -1.243454 #判断 df1 和 df 是否相等 #df == df1 A B C D 0 True True True True 1 True True True True 2 True True True True 3 True True True True 4 True True True True 5 True True True True 6 True True True True 7 True True True True 8 True True True True 9 True True True True #(df == df1).all() A True B True C True D True #(df == df1).all().all() True ###------------- merge() ---------- #left = pd.DataFrame({‘key‘:[‘foo‘,‘foo‘],‘rval‘:[1,2]}) #right = pd.DataFrame({‘key‘:[‘foo‘,‘foo‘],‘rval‘:[4,5]}) #left key rval 0 foo 1 1 foo 2 #right key rval 0 foo 4 1 foo 5 #按照key关联 #pd.merge(left,right,on=‘key‘) key 
rval_x rval_y 0 foo 1 4 1 foo 1 5 2 foo 2 4 3 foo 2 5 ###------------- groupby() ----------- #df = pd.DataFrame({‘A‘ : [‘foo‘,‘bar‘,‘foo‘,‘bar‘], ‘B‘ : [‘one‘,‘two‘,‘one‘,‘two‘], ‘C‘: np.random.randn(4), ‘D‘: np.random.randn(4) }) #df A B C D 0 foo one 1.261140 1.109300 1 bar two -2.072375 -0.533420 2 foo one 1.470090 0.243500 3 bar two -0.243230 -0.721201 #分组 #df.groupby(‘A‘).sum() A C D bar -2.315605 -1.254621 foo 2.731229 1.352800 #df.groupby([‘A‘,‘B‘]).sum() A B C D bar two -1.039401 -0.338988 foo one -1.004454 1.169985 -------------------------序列--------------------- #s = pd.Series(np.random.randint(10,20,size=20)) #s 0 13 1 16 2 11 3 13 4 17 5 18 6 12 7 11 8 13 9 12 10 18 11 15 12 19 13 10 14 16 15 10 16 16 17 17 18 10 19 13 # 每个数字产生的次数 #s.value_counts() 15 5 18 3 13 3 17 2 12 2 11 2 19 1 16 1 10 1 # 产生最多的数 #s.mode() 0 15 ---------------------------------------------------------- # 创建序列 包含空值 #s = pd.Series([1,3,5,np.nan,6,8],index=dates) #s 2016-08-21 1.0 2016-08-22 3.0 2016-08-23 5.0 2016-08-24 NaN 2016-08-25 6.0 2016-08-26 8.0 Freq: D, dtype: float64 In [12]: #s = pd.Series([1,3,5,np.nan,6,8],index=dates).shift(2) #s 2016-08-21 NaN 2016-08-22 NaN 2016-08-23 1.0 2016-08-24 3.0 2016-08-25 5.0 2016-08-26 NaN Freq: D, dtype: float64 #df A B C D 2016-08-21 -2.251855 0.967517 0.045508 1.011237 2016-08-22 2.092096 -1.978989 0.596700 0.440067 2016-08-23 0.755188 -0.153414 -1.965380 0.763453 2016-08-24 -0.012479 1.464521 2.744173 -0.196681 2016-08-25 0.041267 0.538850 0.419071 -0.776610 2016-08-26 -0.522202 -1.089871 -1.417252 -1.074622 # 二维 减去 序列 (空值不参与运算) #df.sub(s,axis=‘index‘) A B C D 2016-08-21 NaN NaN NaN NaN 2016-08-22 NaN NaN NaN NaN 2016-08-23 -0.244812 -1.153414 -2.965380 -0.236547 2016-08-24 -3.012479 -1.535479 -0.255827 -3.196681 2016-08-25 -4.958733 -4.461150 -4.580929 -5.776610 2016-08-26 NaN NaN NaN NaN
:Part e 收尾 行列互换
1 #数据整形 (把数据的行和列互换) 2 3 4 %matplotlib inline 5 import pandas as pd 6 import numpy as np 7 import matplotlib.pyplot as plt 8 9 #行索引 10 11 #tuples = list(zip(*[[‘bar‘,‘bar‘,‘baz‘,‘baz‘,‘foo‘,‘foo‘,‘qux‘,‘qux‘],[‘one‘,‘two‘,‘one‘,‘two‘,‘one‘,‘two‘,‘one‘,‘two‘]])) 12 #tuples 13 14 [(‘bar‘, ‘one‘), 15 (‘bar‘, ‘two‘), 16 (‘baz‘, ‘one‘), 17 (‘baz‘, ‘two‘), 18 (‘foo‘, ‘one‘), 19 (‘foo‘, ‘two‘), 20 (‘qux‘, ‘one‘), 21 (‘qux‘, ‘two‘)] 22 #index = pd.MultiIndex.from_tuples(tuples, names=[‘first‘,‘second‘]) 23 #index 24 25 MultiIndex(levels=[[‘bar‘, ‘baz‘, ‘foo‘, ‘qux‘], [‘one‘, ‘two‘]], 26 labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]], 27 names=[‘first‘, ‘second‘]) 28 29 30 #df = pd.DataFrame(np.random.randn(8,2),index=index,columns=[‘A‘,‘B‘]) 31 #df 32 A B 33 first second 34 bar one 1.109199 -0.482758 35 two 0.947951 1.284311 36 baz one -0.733705 -0.088907 37 two 1.146346 -0.946909 38 foo one 0.254840 1.868951 39 two 1.052604 -0.684604 40 qux one 0.133846 0.065647 41 two -1.137040 1.010329 42 43 44 #把列索引变成行索引 45 46 #stacked = df.stack() 47 #stacked 48 49 first second 50 bar one A 1.109199 51 B -0.482758 52 two A 0.947951 53 B 1.284311 54 baz one A -0.733705 55 B -0.088907 56 two A 1.146346 57 B -0.946909 58 foo one A 0.254840 59 B 1.868951 60 two A 1.052604 61 B -0.684604 62 qux one A 0.133846 63 B 0.065647 64 two A -1.137040 65 B 1.010329