Python data analysis - 03: the pandas library
Posted nikecode
#Series
import numpy as np
import pandas as pd
# s1 = pd.Series([1,2,3,4])
# print(s1)
# # 0 1
# # 1 2
# # 2 3
# # 3 4
# # dtype: int64
# print(s1.values) #[1 2 3 4]
# print(s1.index) #RangeIndex(start=0, stop=4, step=1)
# Construct a Series from a NumPy array
# s2 = pd.Series(np.arange(10))
# print(s2)
# 0 0
# 1 1
# 2 2
# 3 3
# 4 4
# 5 5
# 6 6
# 7 7
# 8 8
# 9 9
# dtype: int32
# Construct a Series from a dict
# s3 = pd.Series({"1":1,"2":2,"3":3})
# print(s3)
# # 1 1
# # 2 2
# # 3 3
# # dtype: int64
# print(s3.values) #[1 2 3]
# print(s3.index) #Index(['1', '2', '3'], dtype='object')
# s4 = pd.Series([1,2,3,4],index=['A','B','C','D'])
# print(s4)
# # A 1
# # B 2
# # C 3
# # D 4
# # dtype: int64
# print(s4["A"]) #1
# print(s4[s4>2])
# # C 3
# # D 4
# # dtype: int64
# print(s4.to_dict()) #{'A': 1, 'B': 2, 'C': 3, 'D': 4}
# s4 = pd.Series([1,2,3,4],index=['A','B','C','D'])
# print(s4)
# A 1
# B 2
# C 3
# D 4
# dtype: int64
# index_1 = ['A','B','C','D','E']
# s5 = pd.Series(s4,index=index_1)
#print(s5)
# A 1.0
# B 2.0
# C 3.0
# D 4.0
# E NaN
# dtype: float64
#print(s5.isnull())
# A False
# B False
# C False
# D False
# E True
# dtype: bool
# print(s5.notnull())
# A True
# B True
# C True
# D True
# E False
# dtype: bool
# s5.name = 'demo'
# print(s5)
# A 1.0
# B 2.0
# C 3.0
# D 4.0
# E NaN
# Name: demo, dtype: float64
# s5.index.name = "demo index"
# print(s5)
# demo index
# A 1.0
# B 2.0
# C 3.0
# D 4.0
# E NaN
# Name: demo, dtype: float64
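# The snippets above are all commented out; here is a minimal runnable recap of the same Series
# operations (the values and labels are illustrative only, not from any real dataset).
s_demo = pd.Series({"A": 1, "B": 2, "C": 3, "D": 4})        # build from a dict; keys become the index
s_ext = pd.Series(s_demo, index=["A", "B", "C", "D", "E"])  # re-labelling adds "E" as NaN, dtype becomes float64
print(s_ext[s_ext > 2])                                     # boolean filtering keeps C and D, drops the NaN
print(s_ext.isnull())                                       # True only for the missing label "E"
s_ext.name = "demo"                                         # the name shows up when the Series is printed
s_ext.index.name = "demo index"
print(s_ext.to_dict())                                      # back to a plain dict (NaN preserved)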
from pandas import Series,DataFrame
# import webbrowser
# link = "https://www.tiobe.com/tiobe-index/"
# webbrowser.open(link) # open this page in a browser, then copy the table with the mouse
# df = pd.read_clipboard() # read the data that was just copied to the clipboard
#print(df)
# Apr 2019 Apr 2018 Change Programming Language Ratings Change.1
# 0 1 1 NaN Java 15.035% -0.74%
# 1 2 2 NaN C 14.076% +0.49%
# 2 3 3 NaN C++ 8.838% +1.62%
# 3 4 4 NaN Python 8.166% +2.36%
# 4 5 6 change Visual Basic .NET 5.795% +0.85%
# 5 6 5 change C# 3.515% -1.75%
# 6 7 8 change javascript 2.507% -0.99%
# 7 8 9 change SQL 2.272% -0.38%
# 8 9 7 change php 2.239% -1.98%
# 9 10 14 change Assembly language 1.710% +0.05%
# 10 11 18 change Objective-C 1.505% +0.25%
# 11 12 17 change MATLAB 1.285% -0.17%
# 12 13 10 change Ruby 1.277% -0.74%
# 13 14 16 change Perl 1.269% -0.26%
# 14 15 11 change Delphi/Object Pascal 1.264% -0.70%
# 15 16 12 change R 1.181% -0.63%
# 16 17 13 change Visual Basic 1.060% -0.74%
# 17 18 19 change Go 1.009% -0.17%
# 18 19 15 change Swift 0.978% -0.56%
# 19 20 68 change Groovy NaN NaN
#print(type(df)) #<class 'pandas.core.frame.DataFrame'>
#print(df.columns)
# Index(['Apr 2019', 'Apr 2018', 'Change', 'Programming Language', 'Ratings',
# 'Change.1'],
# dtype='object')
#print(df.Ratings) # attribute access to the Ratings column; print(df["Ratings"]) works the same way
# 0 15.035%
# 1 14.076%
# 2 8.838%
# 3 8.166%
# 4 5.795%
# 5 3.515%
# 6 2.507%
# 7 2.272%
# 8 2.239%
# Name: Ratings, dtype: object
#print(DataFrame(df,columns=["Programming Language","Ratings"]))# select multiple columns
# Programming Language Ratings
# 0 Java 15.035%
# 1 C 14.076%
# 2 C++ 8.838%
# 3 Python 8.166%
# 4 Visual Basic .NET 5.795%
# 5 C# 3.515%
# 6 JavaScript 2.507%
# 7 SQL 2.272%
# 8 PHP 2.239%
# Add a new column "Apr 2020"; with no data to align, it is filled with NaN
#df_new = DataFrame(df,columns=["Programming Language","Ratings","Apr 2020"])
# print(df_new)
# Programming Language Ratings Apr 2020
# 0 Java 15.035% NaN
# 1 C 14.076% NaN
# 2 C++ 8.838% NaN
# 3 Python 8.166% NaN
# 4 Visual Basic .NET 5.795% NaN
# 5 C# 3.515% NaN
# 6 JavaScript 2.507% NaN
# 7 SQL 2.272% NaN
# 8 PHP 2.239% NaN
# df_new["Apr 2020"] = range(9)
# print(df_new)# assign values to the new column
# Programming Language Ratings Apr 2020
# 0 Java 15.035% 0
# 1 C 14.076% 1
# 2 C++ 8.838% 2
# 3 Python 8.166% 3
# 4 Visual Basic .NET 5.795% 4
# 5 C# 3.515% 5
# 6 JavaScript 2.507% 6
# 7 SQL 2.272% 7
# 8 PHP 2.239% 8
# Assign a NumPy array to the column
# df_new["Apr 2020"] = np.arange(9)
# print(df_new)
# Programming Language Ratings Apr 2020
# 0 Java 15.035% 0
# 1 C 14.076% 1
# 2 C++ 8.838% 2
# 3 Python 8.166% 3
# 4 Visual Basic .NET 5.795% 4
# 5 C# 3.515% 5
# 6 JavaScript 2.507% 6
# 7 SQL 2.272% 7
# 8 PHP 2.239% 8
# Each column is itself a Series, so a Series can be assigned as well
# df_new["Apr 2020"] = pd.Series(np.arange(9))
# print(df_new)
# Programming Language Ratings Apr 2020
# 0 Java 15.035% 0
# 1 C 14.076% 1
# 2 C++ 8.838% 2
# 3 Python 8.166% 3
# 4 Visual Basic .NET 5.795% 4
# 5 C# 3.515% 5
# 6 JavaScript 2.507% 6
# 7 SQL 2.272% 7
# 8 PHP 2.239% 8
# Assign to selected rows only; the Series is aligned by index and the other rows become NaN
# df_new["Apr 2020"] = pd.Series([100,200],index=[1,2])
# print(df_new)
# Programming Language Ratings Apr 2020
# 0 Java 15.035% NaN
# 1 C 14.076% 100.0
# 2 C++ 8.838% 200.0
# 3 Python 8.166% NaN
# 4 Visual Basic .NET 5.795% NaN
# 5 C# 3.515% NaN
# 6 JavaScript 2.507% NaN
# 7 SQL 2.272% NaN
# 8 PHP 2.239% NaN
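# The clipboard-based DataFrame above depends on what was last copied and cannot be re-run as-is;
# this minimal sketch shows the same column-assignment behaviour on a small hand-made frame
# (the values are copied from the table shown above, the frame itself is made up).
demo_df = DataFrame({"Programming Language": ["Java", "C", "Python"], "Ratings": ["15.035%", "14.076%", "8.166%"]})
demo_df["Apr 2020"] = np.arange(3)                          # array assignment: length must match the row count
demo_df["Apr 2020"] = pd.Series([100, 200], index=[1, 2])   # Series assignment: aligned by index, row 0 becomes NaN
print(demo_df)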
data = {'country':['belgium','India','Brazil'],
        'Capital':['Brussels','New Delhi','Brasilia'],
        'Population':[11190846,1303171035,207847528]}
# s1 = pd.Series(data['country'])
# print(s1)
# 0 belgium
# 1 India
# 2 Brazil
# dtype: object
# print(s1.values)#['belgium' 'India' 'Brazil']
# print(s1.index)#RangeIndex(start=0, stop=3, step=1)
# s1 = pd.Series(data['country'],index=['A','B','C'])
# print(s1)
# A belgium
# B India
# C Brazil
# dtype: object
#DataFrame
# df1 = DataFrame(data)
# print(df1)
# country Capital Population
# 0 belgium Brussels 11190846
# 1 India New Delhi 1303171035
# 2 Brazil Brasilia 207847528
# cou = df1["country"]
# print(type(cou)) #<class 'pandas.core.series.Series'>
# print(df1.iterrows()) #<generator object DataFrame.iterrows at 0x000000000B6F4DE0>
# for row in df1.iterrows():
# print(row)
# # (0, country belgium
# # Capital Brussels
# # Population 11190846
# # Name: 0, dtype: object)
# print(row[0])#0
# print(row[1])
# # country belgium
# # Capital Brussels
# # Population 11190846
# # Name: 0, dtype: object
# print(type(row[1])) #<class 'pandas.core.series.Series'>
# break
# s1 = pd.Series(data["country"])
# s2 = pd.Series(data["Capital"])
# s3 = pd.Series(data["Population"])
#print(s1)
# 0 belgium
# 1 India
# 2 Brazil
# dtype: object
# df_new = DataFrame([s1,s2,s3],index=['country','Capital','Population'])
# df_new = df_new.T
# print(df_new)
# country Capital Population
# 0 belgium Brussels 11190846
# 1 India New Delhi 1303171035
# 2 Brazil Brasilia 207847528
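# As a cross-check, the same table can be built either straight from the dict or from a list of
# Series plus a transpose; this short sketch reuses the `data` dict defined above.
df_direct = DataFrame(data)                                 # dict of lists: keys become the columns
rows = [pd.Series(data[k], name=k) for k in data]           # one named Series per future column
df_from_series = DataFrame(rows).T                          # the Series arrive as rows, so transpose back
print(df_direct)
print(df_from_series)                                       # same layout; dtypes may all be object after .T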
#DataFrame IO operations
# import webbrowser
# link = "http://pandas.pydata.org/pandas-docs/version/0.20/io.html"
# webbrowser.open(link)
#
# df1 = pd.read_clipboard()
#print(df1)
# Format Type Data Description Reader Writer
# 0 text CSV read_csv to_csv
# 1 text JSON read_json to_json
# 2 text HTML read_html to_html
# 3 text Local clipboard read_clipboard to_clipboard
# 4 binary MS Excel read_excel to_excel
# 5 binary HDF5 Format read_hdf to_hdf
# 6 binary Feather Format read_feather to_feather
# 7 binary Msgpack read_msgpack to_msgpack
# 8 binary Stata read_stata to_stata
# 9 binary SAS read_sas
# 10 binary Python Pickle Format read_pickle to_pickle
# 11 SQL SQL read_sql to_sql
# 12 SQL Google Big Query read_gbq to_gbq
# df1.to_csv("df1.csv",index=False)# write to CSV without the leading index column
# df2 = pd.read_csv("df1.csv")
# print(df2)
# Format Type Data Description Reader Writer
# 0 text CSV read_csv to_csv
# 1 text JSON read_json to_json
# 2 text HTML read_html to_html
# 3 text Local clipboard read_clipboard to_clipboard
# 4 binary MS Excel read_excel to_excel
# 5 binary HDF5 Format read_hdf to_hdf
# 6 binary Feather Format read_feather to_feather
# 7 binary Msgpack read_msgpack to_msgpack
# 8 binary Stata read_stata to_stata
# 9 binary SAS read_sas
# 10 binary Python Pickle Format read_pickle to_pickle
# 11 SQL SQL read_sql to_sql
# 12 SQL Google Big Query read_gbq to_gbq
# print(df1.to_json())
# "Format":"0":"text","1":"text","2":"text","3":"text","4":"binary","5":"binary","6":"binary","7":"binary","8":"binary","9":"binary","10":"binary","11":"SQL","12":"SQL","Type":"0":"CSV","1":"JSON","2":"HTML","3":"Local","4":"MS","5":"HDF5","6":"Feather","7":"Msgpack","8":"Stata","9":"SAS","10":"Python","11":"SQL","12":"Google","Data":"0":"read_csv","1":"read_json","2":"read_html","3":"clipboard","4":"Excel","5":"Format","6":"Format","7":"read_msgpack","8":"read_stata","9":"read_sas","10":"Pickle","11":"read_sql","12":"Big","Description":"0":"to_csv","1":"to_json","2":"to_html","3":"read_clipboard","4":"read_excel","5":"read_hdf","6":"read_feather","7":"to_msgpack","8":"to_stata","9":null,"10":"Format","11":"to_sql","12":"Query","Reader":"0":null,"1":null,"2":null,"3":"to_clipboard","4":"to_excel","5":"to_hdf","6":"to_feather","7":null,"8":null,"9":null,"10":"read_pickle","11":null,"12":"read_gbq","Writer":"0":null,"1":null,"2":null,"3":null,"4":null,"5":null,"6":null,"7":null,"8":null,"9":null,"10":"to_pickle","11":null,"12":"to_gbq"
# print(pd.read_json(df1.to_json()))
# Format Type Data Description Reader Writer
# 0 text CSV read_csv to_csv
# 1 text JSON read_json to_json
# 10 binary Python Pickle Format read_pickle to_pickle
# 11 SQL SQL read_sql to_sql
# 12 SQL Google Big Query read_gbq to_gbq
# 2 text HTML read_html to_html
# 3 text Local clipboard read_clipboard to_clipboard
# 4 binary MS Excel read_excel to_excel
# 5 binary HDF5 Format read_hdf to_hdf
# 6 binary Feather Format read_feather to_feather
# 7 binary Msgpack read_msgpack to_msgpack
# 8 binary Stata read_stata to_stata
# 9 binary SAS read_sas
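# read_clipboard is not reproducible, so here is a small self-contained round trip using the same
# to_csv / read_csv / to_json / read_json calls (the file name demo_io.csv is arbitrary).
demo_io = DataFrame({"Format Type": ["text", "binary"], "Reader": ["read_csv", "read_excel"]})
demo_io.to_csv("demo_io.csv", index=False)                  # write without the integer index column
print(pd.read_csv("demo_io.csv"))                           # read it back unchanged
print(pd.read_json(demo_io.to_json()))                      # JSON round trip; row order may differ, as seen above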
# Assume a movie_metadata.csv file is available locally
# imbd = pd.read_csv("movie_metadata.csv")
# print(imbd.shape)#(5043,28)
# print(imbd.head())
# print(imbd[["color","director_name"]])#q取出两列数据
# sub_df = imbd["director_name","movie_title","imbd_score"]
# print(sub_df.head(5))
# print(sub_df.iloc[10:20,:])
# print(sub_df.iloc[10:20,0:2])
# print(sub_df.loc[10:20,:])# like iloc, but the end label 20 is included
# print(sub_df.loc[10:20,:"director_name"]) # loc can slice by column labels (keys) as well
#Series Reindex
#s1 = Series([1,2,3,4],index=['A','B','C','D'])
# print(s1)
# A 1
# B 2
# C 3
# D 4
# dtype: int64
#print(s1.reindex(index=['A','B','C','D','E']))
# A 1.0
# B 2.0
# C 3.0
# D 4.0
# E NaN
# dtype: float64
#print(s1.reindex(index=['A','B','C','D','E'],fill_value=10))
# A 1
# B 2
# C 3
# D 4
# E 10
# dtype: int64
#s2 = Series(['A','B','C'],index=[1,5,10])
#print(s2)
# 1 A
# 5 B
# 10 C
# dtype: object
#print(s2.reindex(index=range(15)))
# 0 NaN
# 1 A
# 2 NaN
# 3 NaN
# 4 NaN
# 5 B
# 6 NaN
# 7 NaN
# 8 NaN
# 9 NaN
# 10 C
# 11 NaN
# 12 NaN
# 13 NaN
# 14 NaN
# dtype: object
#print(s2.reindex(index=range(15),method="ffill"))
# 0 NaN
# 1 A
# 2 A
# 3 A
# 4 A
# 5 B
# 6 B
# 7 B
# 8 B
# 9 B
# 10 C
# 11 C
# 12 C
# 13 C
# 14 C
# dtype: object
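# A runnable recap of the two gap-filling strategies shown above (the values are illustrative):
s_gap = Series(["A", "B", "C"], index=[1, 5, 10])
print(s_gap.reindex(range(12), fill_value="-"))             # every missing label gets the constant "-"
print(s_gap.reindex(range(12), method="ffill"))             # forward fill repeats the last known value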
#Reindex dataframe
# df1 = DataFrame(np.random.rand(25).reshape(5,5))
# print(df1)
# 0 1 2 3 4
# 0 0.150685 0.741189 0.642348 0.625132 0.318640
# 1 0.781998 0.793684 0.434840 0.053550 0.076352
# 2 0.657116 0.261819 0.089875 0.298170 0.035670
# 3 0.408057 0.550972 0.298262 0.734598 0.920229
# 4 0.707607 0.163687 0.861138 0.553325 0.439473
# df2 = DataFrame(np.random.rand(25).reshape(5,5),index=['A','B','D','E','F'],columns=['c1','c2','c3','c4','c5'])
# print(df2)
# c1 c2 c3 c4 c5
# A 0.096956 0.687012 0.242486 0.106347 0.951611
# B 0.534206 0.555345 0.743860 0.156659 0.228296
# D 0.963385 0.648523 0.603671 0.904279 0.161911
# E 0.549797 0.987869 0.048364 0.706606 0.820717
# F 0.003817 0.923006 0.611485 0.986054 0.160444
# print(df2.reindex(index=['A','B','D','C','E','F']))
# c1 c2 c3 c4 c5
# A 0.745011 0.621461 0.288680 0.177793 0.013119
# B 0.431538 0.170305 0.780363 0.007156 0.139781
# D 0.663396 0.807862 0.732135 0.347896 0.959864
# C NaN NaN NaN NaN NaN
# E 0.145247 0.191087 0.811372 0.648703 0.697846
# F 0.742532 0.439197 0.612185 0.114661 0.221951
# print(df2.reindex(columns=['c1','c2','c3','c4','c5','c6']))
# c1 c2 c3 c4 c5 c6
# A 0.287383 0.910655 0.418470 0.613704 0.200391 NaN
# B 0.942793 0.389105 0.619344 0.076861 0.474860 NaN
# D 0.945629 0.308200 0.165710 0.152989 0.552817 NaN
# E 0.876477 0.138687 0.838985 0.656992 0.773661 NaN
# F 0.866165 0.539998 0.500313 0.540542 0.002450 NaN
# print(df2.reindex(index=['A','B','D','C','E','F'],columns=['c1','c2','c3','c4','c5','c6']))
# c1 c2 c3 c4 c5 c6
# A 0.978832 0.807321 0.366297 0.148317 0.308838 NaN
# B 0.905668 0.114278 0.368676 0.428269 0.162910 NaN
# D 0.930796 0.963658 0.902773 0.584296 0.295554 NaN
# C NaN NaN NaN NaN NaN NaN
# E 0.101119 0.000268 0.301075 0.697321 0.121599 NaN
# F 0.402271 0.660168 0.477529 0.590062 0.459596 NaN
# print(df2.reindex(index=['A','B']))
# c1 c2 c3 c4 c5
# A 0.855483 0.462398 0.282791 0.454249 0.027320
# B 0.223694 0.827418 0.368981 0.867265 0.471167
# print(df2.drop("A"))
# c1 c2 c3 c4 c5
# B 0.047756 0.880659 0.744061 0.012340 0.216161
# D 0.603093 0.769085 0.526477 0.187897 0.991472
# E 0.159034 0.909088 0.765743 0.428868 0.972190
# F 0.239292 0.982104 0.802697 0.848463 0.503050
# print(df2.drop("A",axis=0))
# c1 c2 c3 c4 c5
# B 0.474883 0.859859 0.594369 0.077369 0.616871
# D 0.562033 0.190256 0.882217 0.810458 0.855765
# E 0.545617 0.872125 0.406509 0.544556 0.718795
# F 0.944125 0.268808 0.070181 0.351121 0.040010
# print(df2.drop("c1",axis=1))
# c2 c3 c4 c5
# A 0.404537 0.646484 0.319498 0.818558
# B 0.231232 0.132706 0.851948 0.061789
# D 0.067037 0.789874 0.368729 0.761373
# E 0.176873 0.294302 0.818214 0.284220
# F 0.378809 0.835109 0.124004 0.857353
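# Note that reindex and drop both return new frames and leave the original untouched; a small seeded
# sketch (the index and column labels are made up) so the output is reproducible:
rng = np.random.RandomState(0)
df_demo = DataFrame(rng.rand(3, 3), index=list("ABD"), columns=["c1", "c2", "c3"])
dropped = df_demo.drop("A")                                 # drop returns a copy; df_demo still has row A
print(df_demo.shape, dropped.shape)                         # (3, 3) (2, 3)
print(df_demo.reindex(index=["A", "B", "C", "D"]).shape)    # (4, 3): the new label C comes in as a NaN row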