01.pandas
Posted by kingboy100
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了01.pandas相关的知识,希望对你有一定的参考价值。
01.Series
# -*- coding: utf-8 -*-
"""
Series basics.

- Build a pandas Series from a list or from a dict.
- Index and slice it like a list, or filter it with a boolean mask.
- Add two Series (values are matched by index label) and handle the
  resulting missing values.
- Modify elements, compute summary statistics, and apply scalar
  (broadcast) arithmetic.
"""

import pandas as pd  # pd.Series()
from pandas import Series  # Series()

# 1. Creating a Series

# 1) From a list
lst = [4000, 3000, 2000, 3500]
print(lst * 2)  # list repetition: the list is repeated twice
price = Series([4000, 3000, 2000, 3500])
print(price * 2)  # element-wise multiplication

print(price.index)  # index
print(price.values)  # data

print(lst[0], price[0])  # 4000 4000

# 2) From a dict: key -> index label, value -> data
person = pd.Series({'name': '???', 'age': 35, 'addr': '???'})
print(person)
# Output recorded by the original author (label order may differ
# between pandas versions):
# addr    ???
# age      35
# name    ???
print(person['age'])  # 35

# 2. Indexing (list style)
ser_data = pd.Series([4, 4.5, 6, 8, 10.5])
print(len(ser_data))  # 5

print(ser_data[0])  # 4.0
print(ser_data[:3])  # first 3 elements
print(ser_data[3:])  # last 2 elements
print(ser_data[:])  # all elements
# NOTE: ser_data[-1] is not list-style here; an integer key is looked up
# as an index label and -1 is not a label. Use ser_data.iloc[-1] instead.

# Boolean mask
print(ser_data[ser_data >= 5])
# 2     6.0
# 3     8.0
# 4    10.5


# 3. Combining Series and handling missing values (NA)
data1 = Series([4000, None, 3500, 2000],
               index=['a', 'm', 'o', 'k'])
data2 = Series([4000, 3000, 3500, 2000],
               index=['a', 'o', 'k', 'm'])
# Values are joined on the index label, not on position.
result = data1 + data2
print(result)
print(type(result))  # Series
# a    8000.0
# k    5500.0
# m       NaN   -> missing, because data1['m'] is None
# o    6500.0

# Filling NA: with 0, or with the mean of the non-missing values
result2 = result.fillna(0)
result3 = result.fillna(result.mean())
print('0 ?? :', result2)
print('?? ?? :', result3)
# 0 ?? : a    8000.0
# k    5500.0
# m       0.0
# o    6500.0
# dtype: float64
# ?? ?? : a    8000.000000
# k    5500.000000
# m    6666.666667
# o    6500.000000

print(pd.notnull(result))
# a     True
# k     True
# m    False
# o     True

# Subset that excludes the missing values
subset = result[pd.notnull(result)]
print(subset)
# a    8000.0
# k    5500.0
# o    6500.0

# 4. Series operations
print(ser_data)

# 1) Modify a range of elements (positions 1, 2 and 3)
ser_data.iloc[1:4] = 50
print(ser_data)

# 2) Summary statistics
print(ser_data.sum())
print(ser_data.mean())
print(ser_data.max())
print(ser_data.min())

# 3) Broadcast operation: vector (1-D) * scalar (0-D)
print(ser_data * 0.5)
# 0     2.00
# 1    25.00
# 2    25.00
# 3    25.00
# 4     5.25
02.DataFrame
# -*- coding: utf-8 -*-
"""
DataFrame basics.

- Build a DataFrame from a dict of lists, add a Series as a column,
  and build one from a NumPy array.
- Inspect and change the index.
- Select columns, drop rows, and create subsets.
- Select by label (.loc) and by position (.iloc).
- Draw a random 70% sample of rows from the iris data.
"""

import numpy as np
import pandas as pd  # pd.DataFrame()
from pandas import DataFrame  # DataFrame()

# 1. Creating a DataFrame

name = ['???', '???', '???', '???']
age = [35, 45, 55, 25]
pay = [350, 450, 550, 250]
emp = DataFrame({'name': name, 'age': age, 'pay': pay},
                columns=['name', 'age', 'pay'])
print(emp)
#   name  age  pay
# 0  ???   35  350
# 1  ???   45  450
# 2  ???   55  550
# 3  ???   25  250

# 1) Add a Series as a new column
gender = pd.Series(['M', 'M', 'M', 'F'])
emp['gender'] = gender
print(emp)

# 2) From a NumPy array
frame = pd.DataFrame(np.arange(12).reshape(3, 4),
                     columns=['a', 'b', 'c', 'd'])
print(frame)
#    a  b   c   d
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11

# Column-wise / row-wise statistics
print(frame.mean())  # mean of each column (default axis=0)
print(frame.mean(axis=0))  # mean of each column
print(frame.mean(axis=1))  # mean of each row

# 2. Index handling
print(frame.index)  # RangeIndex(start=0, stop=3, step=1)
print(frame.values)
# [[ 0  1  2  3]
#  [ 4  5  6  7]
#  [ 8  9 10 11]]
print(frame.columns)
# Index(['a', 'b', 'c', 'd'], dtype='object')

# 1) Use column 'a' as the index
setIdx = frame.set_index('a')
print(setIdx)

# 2) Restore the default index
resetIdx = setIdx.reset_index()
print(resetIdx)


# 3. Selecting columns

# 1) A single column
a_col1 = frame.a  # DF.column
a_col2 = frame['a']  # DF['column']
print(a_col1)
print(a_col2)
print(frame['a'][2])  # 8  DF['column'][index]

# 2) Several columns: pass a list of names (a 'a':'c' slice is not valid here)
print(frame[['a', 'c']])
cols = ['a', 'd']  # list
print(frame[cols])  # was a bare expression, which shows nothing in a script


# 4. Creating subsets

# 1) Keep selected columns
print('subset1')
subset_df = frame[['a', 'c', 'd']]
print(subset_df)

# 2) Drop rows
print('drop')
print(frame.drop(0))  # without row 1 (label 0)
print(frame.drop(1))  # without row 2 (label 1)
# drop() returns a new object; frame itself is left unchanged.

a_col = frame['a']  # DF (2-D) -> vector (1-D)
print(type(a_col))  # Series

# Drop the rows whose 'a' value is below 5
subset_df2 = frame  # start from the full frame
print(subset_df2)

# items() yields (index label, value), so the label handed to drop() is
# correct even when the index is not the default 0..n-1.
for i, c in a_col.items():
    print('i=', i, 'c=', c)
    if c < 5:
        subset_df2 = subset_df2.drop(i)

# i= 0 c= 0
# i= 1 c= 4
# i= 2 c= 8
print(subset_df2)


# 3) Subsets of a data set read from file
iris = pd.read_csv('../data/iris.csv')
iris.info()  # info() prints its own report and returns None
# RangeIndex: 150 entries, 0 to 149
# Data columns (total 5 columns):
print(type(iris))  # DataFrame
print(iris.columns)
cols = list(iris.columns)  # column names as a list
print(cols)
# ['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width', 'Species']

print(iris[cols[0]])  # first column
print(iris[cols[-1]])  # last column
# Columns 1~3
print(iris[['Sepal.Length', 'Sepal.Width', 'Petal.Length']])
print(iris[cols[:3]])  # same selection via the name list

print(iris.head())


# Columns 1~4: x, column 5: y
iris_x = iris[cols[:4]]
iris_y = iris[cols[-1]]

print(iris_x.shape)  # (150, 4) - 2-D
print(iris_y.shape)  # (150,) - 1-D


# 5. Row/column selection in R style: [row, col1:col3]
# DataFrame.ix was removed in pandas 1.0. Use instead:
#   DF.loc[row label, column label]         - label based, slices inclusive
#   DF.iloc[row position, column position]  - position based
print('frame')
print(frame)
#    a  b   c   d
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11

print(frame.loc[1])  # a single key selects a row
print(frame.iloc[1, 2])  # row 2, column 3 -> 6
print(frame.loc[:, 'd'])  # all rows of column d
print(frame.loc[:, 'b':'c'])  # all rows of columns b through c


print(len(iris))  # number of rows - 150

# 70% -> 105 rows, 30% -> 45 rows

idx = np.random.choice(10, 5, replace=False)  # 5 distinct values from 0..9
print(idx)  # e.g. [4 1 3 6 8]


idx = np.random.choice(len(iris), int(len(iris) * 0.7),
                       replace=False)
print(idx, len(idx))  # 105

# idx holds row positions drawn from range(len(iris)), hence iloc.
train_set = iris.iloc[idx, :]
print(train_set.shape)  # (105, 5)
03.Descriptive
# -*- coding: utf-8 -*-
"""
1. Descriptive statistics of a DataFrame
2. Correlation between columns
"""

import pandas as pd


product = pd.read_csv('../data/product.csv')
product.info()  # info() prints its own report and returns None

# Summary statistics
summary = product.describe()
print(summary)

# Column-wise / row-wise totals: axis=0 or 1
print(product.sum(axis=0))  # sum of each column
# a    773
# b    827
# c    817
print(product.sum(axis=1))  # sum of each row


# Dispersion
print(product.var())  # variance
print(product.std())  # standard deviation

# Frequency of each value
a_cnt = product['a'].value_counts()
print(a_cnt)
# 3    126
# 4     64
# 2     37
# 1     30
# 5      7

# Distinct values
b_uni = product['b'].unique()
print(b_uni)  # [4 3 2 5 1]

# Correlation coefficient between columns (-1 <= r <= 1)
p_corr = product.corr()
print(p_corr)
#           a         b         c
# a  1.000000  0.499209  0.467145
# b  0.499209  1.000000  0.766853
# c  0.467145  0.766853  1.000000

ac_corr = product['a'].corr(product['c'])
print(ac_corr)  # 0.4671449836008965

# Exercise: correlation between iris columns 1~4.
# iris was never defined in this script (NameError), so load it here.
# NOTE(review): path taken from the companion DataFrame script - confirm.
iris = pd.read_csv('../data/iris.csv')
cols = list(iris.columns)
print(cols)  # list of the 5 column names
iris_sub = iris[cols[:4]]

print(iris_sub.corr())
04.merge
# -*- coding: utf-8 -*-
"""
DataFrame merge
"""

import pandas as pd

wdbc = pd.read_csv("../data/wdbc_data.csv")
wdbc.info()  # info() prints its own report and returns None
# RangeIndex: 569 entries, 0 to 568
# Data columns (total 32 columns):

cols = list(wdbc.columns)
print(cols)

df1 = wdbc[cols[:16]]  # columns 1~16
sid = wdbc['id']  # id column
# Take an explicit copy: adding a column to a slice of wdbc would raise
# SettingWithCopyWarning.
df2 = wdbc[cols[16:]].copy()  # columns 17~32

df2['id'] = sid

print(df1.shape)  # (569, 16)
print(df2.shape)  # (569, 17)


# 1. Merge the two frames on the id column (how='inner' is the default)
df_merge = pd.merge(df1, df2, on='id')
df_merge.info()
# Recorded by the original author:
# <class 'pandas.core.frame.DataFrame'>
# Int64Index: 569 entries, 0 to 568
# Data columns (total 32 columns):

# 2. Join frames that share no key column
df1 = wdbc[cols[:16]]  # columns 1~16
df2 = wdbc[cols[16:]]  # columns 17~32

df_merge2 = pd.concat([df1, df2], axis=1)  # side by side, column-wise
df_merge2.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 569 entries, 0 to 568
# Data columns (total 32 columns):
05.timeSeries
# -*- coding: utf-8 -*-
"""
Time series handling
 1. Convert the date column (string -> datetime)
 2. Plot the series
 3. Moving averages
"""

from datetime import datetime  # string -> datetime conversion

import matplotlib.pyplot as plt
import pandas as pd

cospi = pd.read_csv("../data/cospi.csv")
cospi.info()  # info() prints its own report and returns None
# RangeIndex: 247 entries, 0 to 246
# Data columns (total 6 columns):
# Date      247 non-null object
# Open      247 non-null int64
# High      247 non-null int64
# Low       247 non-null int64
# Close     247 non-null int64
# Volume    247 non-null int64

print(cospi.head())
# 0  26-Feb-16  1180000  1187000  1172000  1172000  176906
# 26-Feb-16 -> 2016-2-26

# 1. Convert the date column (string -> datetime)
Date = cospi['Date']  # cospi.Date
kDate = [datetime.strptime(d, "%d-%b-%y") for d in Date]

print(kDate[:10])

cospi['Date'] = kDate  # string -> datetime
print(cospi.head())


# 2. Plot the series

# Trend line of one column
cospi['High'].plot(title="Trend line of High column")
plt.show()

# Trend lines of two columns
cospi[['High', 'Low']].plot(title="Trend line of High vs Low")
plt.show()

# Index handling
print(cospi.index)
# RangeIndex(start=0, stop=247, step=1)

# Use the Date column as the index
new_cospi = cospi.set_index('Date')
print(new_cospi.head())

# Row selection by partial date string must go through .loc;
# df['2016'] was removed for rows in pandas 2.0.

# By year
print(new_cospi.loc['2016'])
print(new_cospi.loc['2015'])

# By month
print(new_cospi.loc['2016-02'])
# By range
# NOTE(review): bounds are given newest-first; this assumes the file is
# ordered newest-first, as head() suggests - confirm.
print(new_cospi.loc['2016-02':'2016-01'])

new_cospi_HL = new_cospi[['High', 'Low']]
new_cospi_HL.loc['2016'].plot(title="title")
plt.show()

new_cospi_HL.loc['2016-02'].plot(title="title")
plt.show()


# 3. Moving averages

# 5-, 10- and 20-row windows (center=False is the default)
roll_mean5 = new_cospi['High'].rolling(window=5).mean()
print(roll_mean5)

roll_mean10 = new_cospi['High'].rolling(window=10).mean()

roll_mean20 = new_cospi['High'].rolling(window=20).mean()

# Plot the rolling means over the raw column
new_cospi['High'].plot(color='orange', label='High column')
roll_mean5.plot(color='red', label='5day rolling mean')
roll_mean10.plot(color='green', label='10day rolling mean')
roll_mean20.plot(color='blue', label='20day rolling mean')
plt.legend(loc='best')
plt.show()
以上是关于01.pandas的主要内容,如果未能解决你的问题,请参考以下文章