pandas, the Data Analysis and Processing Library
Posted by lxl616
# pandas_1
import pandas

food_info = pandas.read_csv("food_info.csv")
#print(type(food_info))
print(food_info.dtypes)
'''
NDB_No              int64
Shrt_Desc          object
Water_(g)         float64
Energ_Kcal          int64
Protein_(g)       float64
Lipid_Tot_(g)     float64
Ash_(g)           float64
Carbohydrt_(g)    float64
Fiber_TD_(g)      float64
Sugar_Tot_(g)     float64
Calcium_(mg)      float64
Iron_(mg)         float64
Magnesium_(mg)    float64
Phosphorus_(mg)   float64
Potassium_(mg)    float64
Sodium_(mg)       float64
Zinc_(mg)         float64
Copper_(mg)       float64
Manganese_(mg)    float64
Selenium_(mcg)    float64
Vit_C_(mg)        float64
Thiamin_(mg)      float64
Riboflavin_(mg)   float64
Niacin_(mg)       float64
Vit_B6_(mg)       float64
Vit_B12_(mcg)     float64
Vit_A_IU          float64
Vit_A_RAE         float64
Vit_E_(mg)        float64
Vit_D_mcg         float64
Vit_D_IU          float64
Vit_K_(mcg)       float64
FA_Sat_(g)        float64
FA_Mono_(g)       float64
FA_Poly_(g)       float64
Cholestrl_(mg)    float64
dtype: object
'''

#first_rows = food_info.head()
#print(first_rows)
#print(food_info.head(3))
#print(food_info.columns)
#print(food_info.shape)

# pandas uses zero-indexing.
# Series object representing the row at index 0.
#print(food_info.loc[0])
# Series object representing the seventh row.
#food_info.loc[6]
# Will throw an error: "KeyError: 'the label [8620] is not in the [index]'"
#food_info.loc[8620]

# The object dtype is equivalent to a string in Python:
# object - For string values
# int - For integer values
# float - For float values
# datetime - For time values
# bool - For Boolean values
#print(food_info.dtypes)

# Returns a DataFrame containing the rows at indexes 3, 4, 5, and 6.
#food_info.loc[3:6]
# Returns a DataFrame containing the rows at indexes 2, 5, and 10. Either of the following approaches will work.
# Method 1
#two_five_ten = [2, 5, 10]
#food_info.loc[two_five_ten]
# Method 2
#food_info.loc[[2, 5, 10]]

# Series object representing the "NDB_No" column.
#ndb_col = food_info["NDB_No"]
#print(ndb_col)
# Alternatively, you can access a column by passing in a string variable.
#col_name = "NDB_No"
#ndb_col = food_info[col_name]

#columns = ["Zinc_(mg)", "Copper_(mg)"]
#zinc_copper = food_info[columns]
#print(zinc_copper)
# Skipping the assignment.
#zinc_copper = food_info[["Zinc_(mg)", "Copper_(mg)"]]

#print(food_info.columns)
#print(food_info.head(2))
col_names = food_info.columns.tolist()
#print(col_names)
gram_columns = []
for c in col_names:
    if c.endswith("(g)"):
        gram_columns.append(c)
gram_df = food_info[gram_columns]
print(gram_df.head(3))
'''
   Water_(g)  Protein_(g)  Lipid_Tot_(g)  Ash_(g)  Carbohydrt_(g)
0      15.87         0.85          81.11     2.11            0.06
1      15.87         0.85          81.11     2.11            0.06
2       0.24         0.28          99.48     0.00            0.00

   Fiber_TD_(g)  Sugar_Tot_(g)  FA_Sat_(g)  FA_Mono_(g)  FA_Poly_(g)
0           0.0           0.06      51.368       21.021        3.043
1           0.0           0.06      50.489       23.426        3.012
2           0.0           0.00      61.924       28.732        3.694
'''
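A side note on the loop above: the same "(g)" column filter can be written without an explicit loop by using the string methods on the column index. A minimal sketch, assuming the same food_info.csv layout as above (Index.str.endswith and boolean column selection via loc are standard pandas features):

import pandas

food_info = pandas.read_csv("food_info.csv")
# Index.str.endswith builds a boolean mask over the column names;
# loc then keeps only the columns whose name ends with "(g)".
gram_df = food_info.loc[:, food_info.columns.str.endswith("(g)")]
print(gram_df.head(3))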
# pandas_2
import pandas

food_info = pandas.read_csv("food_info.csv")
col_names = food_info.columns.tolist()
print(col_names)
print(food_info.head(3))
'''
['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)', 'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)', 'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)', 'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)', 'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg', 'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)', 'Cholestrl_(mg)']

   NDB_No                 Shrt_Desc  Water_(g)  Energ_Kcal  Protein_(g)
0    1001          BUTTER WITH SALT      15.87         717         0.85
1    1002  BUTTER WHIPPED WITH SALT      15.87         717         0.85
2    1003      BUTTER OIL ANHYDROUS       0.24         876         0.28

   Lipid_Tot_(g)  Ash_(g)  Carbohydrt_(g)  Fiber_TD_(g)  Sugar_Tot_(g)  ...
0          81.11     2.11            0.06           0.0           0.06  ...
1          81.11     2.11            0.06           0.0           0.06  ...
2          99.48     0.00            0.00           0.0           0.00  ...

   Vit_A_IU  Vit_A_RAE  Vit_E_(mg)  Vit_D_mcg  Vit_D_IU  Vit_K_(mcg)
0    2499.0      684.0        2.32        1.5      60.0          7.0
1    2499.0      684.0        2.32        1.5      60.0          7.0
2    3069.0      840.0        2.80        1.8      73.0          8.6

   FA_Sat_(g)  FA_Mono_(g)  FA_Poly_(g)  Cholestrl_(mg)
0      51.368       21.021        3.043           215.0
1      50.489       23.426        3.012           219.0
2      61.924       28.732        3.694           256.0

[3 rows x 36 columns]
'''

#print(food_info["Iron_(mg)"])
#div_1000 = food_info["Iron_(mg)"] / 1000
#print(div_1000)
# Adds 100 to each value in the column and returns a Series object.
#add_100 = food_info["Iron_(mg)"] + 100
# Subtracts 100 from each value in the column and returns a Series object.
#sub_100 = food_info["Iron_(mg)"] - 100
# Multiplies each value in the column by 2 and returns a Series object.
#mult_2 = food_info["Iron_(mg)"] * 2

# Arithmetic between two columns is applied element-wise: the first value in both
# columns, then the second value in both columns, and so on.
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]

iron_grams = food_info["Iron_(mg)"] / 1000
food_info["Iron_(g)"] = iron_grams

# Score = 2 * Protein_(g) - 0.75 * Lipid_Tot_(g)
weighted_protein = food_info["Protein_(g)"] * 2
weighted_fat = -0.75 * food_info["Lipid_Tot_(g)"]
initial_rating = weighted_protein + weighted_fat

# The "Vit_A_IU" column ranges from 0 to 100000, while the "Fiber_TD_(g)" column ranges from 0 to 79.
# For certain calculations, columns like "Vit_A_IU" can have a greater effect on the result,
# due to the scale of the values.
# The largest value in the "Energ_Kcal" column.
max_calories = food_info["Energ_Kcal"].max()
# Divide the values in "Energ_Kcal" by the largest value.
normalized_calories = food_info["Energ_Kcal"] / max_calories
normalized_protein = food_info["Protein_(g)"] / food_info["Protein_(g)"].max()
normalized_fat = food_info["Lipid_Tot_(g)"] / food_info["Lipid_Tot_(g)"].max()
food_info["Normalized_Protein"] = normalized_protein
food_info["Normalized_Fat"] = normalized_fat

# By default, pandas sorts the data by the specified column in ascending order
# and returns a new DataFrame. inplace=True sorts the DataFrame in place instead.
#print(food_info["Sodium_(mg)"])
food_info.sort_values("Sodium_(mg)", inplace=True)
print(food_info["Sodium_(mg)"])
# Sorts in descending order, rather than ascending.
food_info.sort_values("Sodium_(mg)", inplace=True, ascending=False)
print(food_info["Sodium_(mg)"])
'''
760     0.0
758     0.0
405     0.0
761     0.0
2269    0.0
       ...
8184    NaN
8185    NaN
8195    NaN
8251    NaN
8267    NaN
Name: Sodium_(mg), Length: 8618, dtype: float64
276     38758.0
5814    27360.0
6192    26050.0
1242    26000.0
1245    24000.0
         ...
8184        NaN
8185        NaN
8195        NaN
8251        NaN
8267        NaN
Name: Sodium_(mg), Length: 8618, dtype: float64
'''
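Note that in both orderings the NaN rows stay at the bottom: sort_values keeps missing values last by default. A small sketch of the na_position keyword (a standard sort_values argument) that moves them to the top instead, using the same column as above:

# Descending sort with the missing Sodium_(mg) values listed first.
food_info.sort_values("Sodium_(mg)", ascending=False, na_position="first", inplace=True)
print(food_info["Sodium_(mg)"].head())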
# pandas_3
import pandas as pd
import numpy as np

titanic_survival = pd.read_csv("titanic_train.csv")
titanic_survival.head()

# The pandas library uses NaN, which stands for "not a number", to indicate a missing value.
# We can use the pandas.isnull() function, which takes a pandas Series and returns a Series of True and False values.
age = titanic_survival["Age"]
#print(age.loc[0:10])
age_is_null = pd.isnull(age)
#print(age_is_null)
age_null_true = age[age_is_null]
print(age_null_true)
age_null_count = len(age_null_true)
print(age_null_count)
'''
5     NaN
17    NaN
19    NaN
26    NaN
28    NaN
       ..
859   NaN
863   NaN
868   NaN
878   NaN
888   NaN
Name: Age, Length: 177, dtype: float64
177
'''

# The result of this is that mean_age would be nan, because any calculation
# we do with a null value also results in a null value.
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
print(mean_age)
'''
nan
'''

# We have to filter out the missing values before we calculate the mean.
good_ages = titanic_survival["Age"][age_is_null == False]
#print(good_ages)
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)
'''
29.69911764705882
'''

# Missing data is so common that many pandas methods automatically filter it out.
correct_mean_age = titanic_survival["Age"].mean()
print(correct_mean_age)
'''
29.69911764705882
'''

# Mean fare for each passenger class.
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    pclass_fares = pclass_rows["Fare"]
    fare_for_class = pclass_fares.mean()
    fares_by_class[this_class] = fare_for_class
print(fares_by_class)
'''
{1: 84.1546875, 2: 20.662183152173913, 3: 13.675550101832993}
'''

# index tells the method which column to group by
# values is the column that we want to apply the calculation to
# aggfunc specifies the calculation we want to perform
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc=np.mean)
print(passenger_survival)
'''
        Survived
Pclass
1       0.629630
2       0.472826
3       0.242363
'''

passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age")
print(passenger_age)
'''
              Age
Pclass
1       38.233441
2       29.877630
3       25.140620
'''

port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc=np.sum)
print(port_stats)
'''
                Fare  Survived
Embarked
C         10072.2962        93
Q          1022.2543        30
S         17439.3988       217
'''

# Specifying axis=1 or axis='columns' will drop any columns that have null values.
drop_na_columns = titanic_survival.dropna(axis=1)
new_titanic_survival = titanic_survival.dropna(axis=0, subset=["Age", "Sex"])
print(drop_na_columns.shape)
'''
(891, 9)
'''

row_index_83_age = titanic_survival.loc[83, "Age"]
row_index_766_pclass = titanic_survival.loc[766, "Pclass"]
print(row_index_83_age)
print(row_index_766_pclass)
'''
28.0
1
'''

new_titanic_survival = titanic_survival.sort_values("Age", ascending=False)
print(new_titanic_survival[0:10])
# drop=True discards the old index and generates a fresh one.
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print(titanic_reindexed.iloc[0:10])
'''
     PassengerId  Survived  Pclass                                  Name
630          631         1       1  Barkworth, Mr. Algernon Henry Wilson
851          852         0       3                   Svensson, Mr. Johan
493          494         0       1               Artagaveytia, Mr. Ramon
96            97         0       1             Goldschmidt, Mr. George B
116          117         0       3                  Connors, Mr. Patrick
672          673         0       2           Mitchell, Mr. Henry Michael
745          746         0       1          Crosby, Capt. Edward Gifford
33            34         0       2                 Wheadon, Mr. Edward H
54            55         0       1        Ostby, Mr. Engelhart Cornelius
280          281         0       3                      Duane, Mr. Frank

      Sex   Age  SibSp  Parch      Ticket     Fare Cabin Embarked
630  male  80.0      0      0       27042  30.0000   A23        S
851  male  74.0      0      0      347060   7.7750   NaN        S
493  male  71.0      0      0    PC 17609  49.5042   NaN        C
96   male  71.0      0      0    PC 17754  34.6542    A5        C
116  male  70.5      0      0      370369   7.7500   NaN        Q
672  male  70.0      0      0  C.A. 24580  10.5000   NaN        S
745  male  70.0      1      1   WE/P 5735  71.0000   B22        S
33   male  66.0      0      0  C.A. 24579  10.5000   NaN        S
54   male  65.0      0      1      113509  61.9792   B30        C
280  male  65.0      0      0      336439   7.7500   NaN        Q

   PassengerId  Survived  Pclass                                  Name   Sex
0          631         1       1  Barkworth, Mr. Algernon Henry Wilson  male
1          852         0       3                   Svensson, Mr. Johan  male
2          494         0       1               Artagaveytia, Mr. Ramon  male
3           97         0       1             Goldschmidt, Mr. George B  male
4          117         0       3                  Connors, Mr. Patrick  male
5          673         0       2           Mitchell, Mr. Henry Michael  male
6          746         0       1          Crosby, Capt. Edward Gifford  male
7           34         0       2                 Wheadon, Mr. Edward H  male
8           55         0       1        Ostby, Mr. Engelhart Cornelius  male
9          281         0       3                      Duane, Mr. Frank  male

    Age  SibSp  Parch      Ticket     Fare Cabin Embarked
0  80.0      0      0       27042  30.0000   A23        S
1  74.0      0      0      347060   7.7750   NaN        S
2  71.0      0      0    PC 17609  49.5042   NaN        C
3  71.0      0      0    PC 17754  34.6542    A5        C
4  70.5      0      0      370369   7.7500   NaN        Q
5  70.0      0      0  C.A. 24580  10.5000   NaN        S
6  70.0      1      1   WE/P 5735  71.0000   B22        S
7  66.0      0      0  C.A. 24579  10.5000   NaN        S
8  65.0      0      1      113509  61.9792   B30        C
9  65.0      0      0      336439   7.7500   NaN        Q
'''

# This function returns the hundredth item from a Series.
def hundredth_row(column):
    # Extract the hundredth item.
    hundredth_item = column.iloc[99]
    return hundredth_item

# Return the hundredth item from each column.
hundredth_row = titanic_survival.apply(hundredth_row)
print(hundredth_row)
'''
PassengerId                  100
Survived                       0
Pclass                         2
Name           Kantor, Mr. Sinai
Sex                         male
Age                           34
SibSp                          1
Parch                          0
Ticket                    244367
Fare                          26
Cabin                        NaN
Embarked                       S
dtype: object
'''

# Count the missing values in each column.
def not_null_count(column):
    column_null = pd.isnull(column)
    null = column[column_null]
    return len(null)

column_null_count = titanic_survival.apply(not_null_count)
print(column_null_count)
'''
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
age_labels       0
dtype: int64
'''
#len(titanic_survival[pd.isnull(titanic_survival)])
#titanic_survival

# By passing in the axis=1 argument, we can use the DataFrame.apply() method to iterate over rows instead of columns.
def which_class(row):
    pclass = row['Pclass']
    if pd.isnull(pclass):
        return "Unknown"
    elif pclass == 1:
        return "First Class"
    elif pclass == 2:
        return "Second Class"
    elif pclass == 3:
        return "Third Class"

classes = titanic_survival.apply(which_class, axis=1)
print(classes)
'''
0       Third Class
1       First Class
2       Third Class
3       First Class
4       Third Class
           ...
886    Second Class
887     First Class
888     Third Class
889     First Class
890     Third Class
Length: 891, dtype: object
'''

def is_minor(row):
    if row["Age"] < 18:
        return True
    else:
        return False

minors = titanic_survival.apply(is_minor, axis=1)
#print(minors)

# Discretization: bucket each age into a label.
def generate_age_label(row):
    age = row["Age"]
    if pd.isnull(age):
        return "unknown"
    elif age < 18:
        return "minor"
    else:
        return "adult"

age_labels = titanic_survival.apply(generate_age_label, axis=1)
print(age_labels)
'''
0        adult
1        adult
2        adult
3        adult
4        adult
         ...
886      adult
887      adult
888    unknown
889      adult
890      adult
Length: 891, dtype: object
'''

titanic_survival['age_labels'] = age_labels
age_group_survival = titanic_survival.pivot_table(index="age_labels", values="Survived")
print(age_group_survival)
'''
            Survived
age_labels
adult       0.381032
minor       0.539823
unknown     0.293785
'''
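The pivot_table calls above are one way to do a split-apply-combine step; the same aggregations can also be written with groupby, which produces equivalent numbers. A minimal sketch using the same Titanic columns:

# Survival rate per class, equivalent to the Pclass/Survived pivot table.
print(titanic_survival.groupby("Pclass")["Survived"].mean())
# Fare and Survived totals per port, equivalent to the Embarked pivot table.
print(titanic_survival.groupby("Embarked")[["Fare", "Survived"]].sum())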
# pandas_4
# Series (collection of values)
# DataFrame (collection of Series objects)
# Panel (collection of DataFrame objects)

# A Series object can hold many data types, including:
# float - for representing float values
# int - for representing integer values
# bool - for representing Boolean values
# datetime64[ns] - for representing date & time, without time zone
# datetime64[ns, tz] - for representing date & time, with time zone
# timedelta[ns] - for representing differences in dates & times (seconds, minutes, etc.)
# category - for representing categorical values
# object - for representing string values

# FILM - film name
# RottenTomatoes - Rotten Tomatoes critics average score
# RottenTomatoes_User - Rotten Tomatoes user average score
# RT_norm - Rotten Tomatoes critics average score (normalized to a 0 to 5 point system)
# RT_user_norm - Rotten Tomatoes user average score (normalized to a 0 to 5 point system)
# Metacritic - Metacritic critics average score
# Metacritic_User - Metacritic user average score
import pandas as pd

fandango = pd.read_csv('fandango_score_comparison.csv')
series_film = fandango['FILM']
print(type(series_film))
print('=========================')
print(series_film[0:5])
print('=========================')
series_rt = fandango['RottenTomatoes']
print(series_rt[0:5])
'''
<class 'pandas.core.series.Series'>
=========================
0    Avengers: Age of Ultron (2015)
1                 Cinderella (2015)
2                    Ant-Man (2015)
3            Do You Believe? (2015)
4     Hot Tub Time Machine 2 (2015)
Name: FILM, dtype: object
=========================
0    74
1    85
2    80
3    18
4    14
Name: RottenTomatoes, dtype: int64
'''

fandango.head()
'''
                             FILM  RottenTomatoes  RottenTomatoes_User  Metacritic  Metacritic_User  IMDB  Fandango_Stars  Fandango_Ratingvalue  RT_norm  RT_user_norm  ...  IMDB_norm  RT_norm_round  RT_user_norm_round  Metacritic_norm_round  Metacritic_user_norm_round  IMDB_norm_round  Metacritic_user_vote_count  IMDB_user_vote_count  Fandango_votes  Fandango_Difference
0  Avengers: Age of Ultron (2015)  74  86  66  7.1  7.8  5.0  4.5  3.70  4.3  ...  3.90  3.5  4.5  3.5  3.5  4.0  1330  271107  14846  0.5
1  Cinderella (2015)               85  80  67  7.5  7.1  5.0  4.5  4.25  4.0  ...  3.55  4.5  4.0  3.5  4.0  3.5   249   65709  12640  0.5
2  Ant-Man (2015)                  80  90  64  8.1  7.8  5.0  4.5  4.00  4.5  ...  3.90  4.0  4.5  3.0  4.0  4.0   627  103660  12055  0.5
3  Do You Believe? (2015)          18  84  22  4.7  5.4  5.0  4.5  0.90  4.2  ...  2.70  1.0  4.0  1.0  2.5  2.5    31    3136   1793  0.5
4  Hot Tub Time Machine 2 (2015)   14  28  29  3.4  5.1  3.5  3.0  0.70  1.4  ...  2.55  0.5  1.5  1.5  1.5  2.5    88   19560   1021  0.5

5 rows x 22 columns
'''

#fandango.loc[[0, 1], ['FILM', 'RottenTomatoes']]
#fandango.FILM[0]
fandango.iloc[1, 2]
'''
80
'''

# Import the Series object from pandas.
from pandas import Series

film_names = series_film.values
print(type(film_names))
#print(film_names)
rt_scores = series_rt.values
#print(rt_scores)
series_custom = Series(rt_scores, index=film_names)
series_custom[['Minions (2015)', 'Leviathan (2014)']]
'''
<class 'numpy.ndarray'>
Minions (2015)      54
Leviathan (2014)    99
dtype: int64
'''

# An integer index is also available.
series_custom = Series(rt_scores, index=film_names)
print(series_custom[['Minions (2015)', 'Leviathan (2014)']])
fiveten = series_custom[5:10]
print(fiveten)
'''
Minions (2015)      54
Leviathan (2014)    99
dtype: int64
The Water Diviner (2015)        63
Irrational Man (2015)           42
Top Five (2014)                 86
Shaun the Sheep Movie (2015)    99
Love & Mercy (2015)             89
dtype: int64
'''

original_index = series_custom.index.tolist()
#print(original_index)
sorted_index = sorted(original_index)
sorted_by_index = series_custom.reindex(sorted_index)
print(sorted_by_index)
'''
'71 (2015)                          97
5 Flights Up (2015)                 52
A Little Chaos (2015)               40
A Most Violent Year (2014)          90
About Elly (2015)                   97
                                    ..
What We Do in the Shadows (2015)    96
When Marnie Was There (2015)        89
While We're Young (2015)            83
Wild Tales (2014)                   96
Woman in Gold (2015)                52
Length: 146, dtype: int64
'''

sc2 = series_custom.sort_index()
sc3 = series_custom.sort_values()
#print(sc2[0:10])
print(sc3[0:10])
'''
Paul Blart: Mall Cop 2 (2015)     5
Hitman: Agent 47 (2015)           7
Hot Pursuit (2015)                8
Fantastic Four (2015)             9
Taken 3 (2015)                    9
The Boy Next Door (2015)         10
The Loft (2015)                  11
Unfinished Business (2015)       11
Mortdecai (2015)                 12
Seventh Son (2015)               12
dtype: int64
'''

# The values in a Series object are treated as an ndarray, the core data type in NumPy.
import numpy as np

# Add each value to itself.
print(np.add(series_custom, series_custom))
# Apply the sine function to each value.
np.sin(series_custom)
# Return the highest value (returns a single value, not a Series).
np.max(series_custom)
'''
Avengers: Age of Ultron (2015)                148
Cinderella (2015)                             170
Ant-Man (2015)                                160
Do You Believe? (2015)                         36
Hot Tub Time Machine 2 (2015)                  28
                                             ...
Mr. Holmes (2015)                             174
'71 (2015)                                    194
Two Days, One Night (2014)                    194
Gett: The Trial of Viviane Amsalem (2015)     200
Kumiko, The Treasure Hunter (2015)            174
Length: 146, dtype: int64
100
'''

# Comparing a Series to a scalar returns a Series object with a Boolean value for each film.
series_custom > 50
series_greater_than_50 = series_custom[series_custom > 50]

criteria_one = series_custom > 50
criteria_two = series_custom < 75
both_criteria = series_custom[criteria_one & criteria_two]
print(both_criteria)
'''
Avengers: Age of Ultron (2015)                                             74
The Water Diviner (2015)                                                   63
Unbroken (2014)                                                            51
Southpaw (2015)                                                            59
Insidious: Chapter 3 (2015)                                                59
The Man From U.N.C.L.E. (2015)                                             68
Run All Night (2015)                                                       60
5 Flights Up (2015)                                                        52
Welcome to Me (2015)                                                       71
Saint Laurent (2015)                                                       51
Maps to the Stars (2015)                                                   60
Pitch Perfect 2 (2015)                                                     67
The Age of Adaline (2015)                                                  54
The DUFF (2015)                                                            71
Ricki and the Flash (2015)                                                 64
Unfriended (2015)                                                          60
American Sniper (2015)                                                     72
The Hobbit: The Battle of the Five Armies (2014)                           61
Paper Towns (2015)                                                         55
Big Eyes (2014)                                                            72
Maggie (2015)                                                              54
Focus (2015)                                                               57
The Second Best Exotic Marigold Hotel (2015)                               62
The 100-Year-Old Man Who Climbed Out the Window and Disappeared (2015)     67
Escobar: Paradise Lost (2015)                                              52
Into the Woods (2014)                                                      71
Inherent Vice (2014)                                                       73
Magic Mike XXL (2015)                                                      62
Woman in Gold (2015)                                                       52
The Last Five Years (2015)                                                 60
Jurassic World (2015)                                                      71
Minions (2015)                                                             54
Spare Parts (2015)                                                         52
dtype: int64
'''

# Data alignment: both Series share the same index, so the addition matches films by label.
rt_critics = Series(fandango['RottenTomatoes'].values, index=fandango['FILM'])
rt_users = Series(fandango['RottenTomatoes_User'].values, index=fandango['FILM'])
rt_mean = (rt_critics + rt_users) / 2
print(rt_mean)
'''
FILM
Avengers: Age of Ultron (2015)               80.0
Cinderella (2015)                            82.5
Ant-Man (2015)                               85.0
Do You Believe? (2015)                       51.0
Hot Tub Time Machine 2 (2015)                21.0
                                             ...
Mr. Holmes (2015)                            82.5
'71 (2015)                                   89.5
Two Days, One Night (2014)                   87.5
Gett: The Trial of Viviane Amsalem (2015)    90.5
Kumiko, The Treasure Hunter (2015)           75.0
Length: 146, dtype: float64
'''
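The combined filter above can also be written with Series.between. Since between is inclusive of both endpoints and the Rotten Tomatoes scores are integers, between(51, 74) selects the same films as the strict (> 50) & (< 75) comparison. A small sketch:

# Inclusive range filter; equals() confirms it matches the two-criteria version above.
both_criteria_alt = series_custom[series_custom.between(51, 74)]
print(both_criteria_alt.equals(both_criteria))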
# pandas_5
import pandas as pd

# set_index() returns a new DataFrame that is indexed by the values in the specified column
# and, by default, drops that column from the DataFrame.
# With drop=False the FILM column is kept.
fandango = pd.read_csv('fandango_score_comparison.csv')
print(type(fandango))
fandango_films = fandango.set_index('FILM', drop=False)
#print(fandango_films.index)
'''
<class 'pandas.core.frame.DataFrame'>
'''

# Slice using either bracket notation or loc[].
fandango_films["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"]
fandango_films.loc["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"]

# A specific movie.
fandango_films.loc['Kumiko, The Treasure Hunter (2015)']

# Selecting a list of movies.
movies = ['Kumiko, The Treasure Hunter (2015)', 'Do You Believe? (2015)', 'Ant-Man (2015)']
fandango_films.loc[movies]

# When selecting multiple rows, a DataFrame is returned,
# but when selecting an individual row, a Series object is returned instead.
'''
                                                                  FILM  RottenTomatoes  RottenTomatoes_User  Metacritic  Metacritic_User  IMDB  Fandango_Stars  Fandango_Ratingvalue  RT_norm  RT_user_norm  ...  IMDB_norm  RT_norm_round  RT_user_norm_round  Metacritic_norm_round  Metacritic_user_norm_round  IMDB_norm_round  Metacritic_user_vote_count  IMDB_user_vote_count  Fandango_votes  Fandango_Difference
FILM
Kumiko, The Treasure Hunter (2015)  Kumiko, The Treasure Hunter (2015)  87  63  68  6.4  6.7  3.5  3.5  4.35  3.15  ...  3.35  4.5  3.0  3.5  3.0  3.5   19    5289     41  0.0
Do You Believe? (2015)                          Do You Believe? (2015)  18  84  22  4.7  5.4  5.0  4.5  0.90  4.20  ...  2.70  1.0  4.0  1.0  2.5  2.5   31    3136   1793  0.5
Ant-Man (2015)                                          Ant-Man (2015)  80  90  64  8.1  7.8  5.0  4.5  4.00  4.50  ...  3.90  4.0  4.5  3.0  4.0  4.0  627  103660  12055  0.5

3 rows x 22 columns
'''

# The apply() method in pandas allows us to specify Python logic.
# It requires a vectorized operation that can be applied over each Series object.
import numpy as np

# dtypes returns the data types as a Series.
types = fandango_films.dtypes
#print(types)
# Filter the data types to just floats; the index attribute returns just the column names.
float_columns = types[types.values == 'float64'].index
# Use bracket notation to filter the columns to just the float columns.
float_df = fandango_films[float_columns]
#print(float_df)

# `x` is a Series object representing a column.
deviations = float_df.apply(lambda x: np.std(x))
print(deviations)
'''
Metacritic_User               1.505529
IMDB                          0.955447
Fandango_Stars                0.538532
Fandango_Ratingvalue          0.501106
RT_norm                       1.503265
RT_user_norm                  0.997787
Metacritic_norm               0.972522
Metacritic_user_nom           0.752765
IMDB_norm                     0.477723
RT_norm_round                 1.509404
RT_user_norm_round            1.003559
Metacritic_norm_round         0.987561
Metacritic_user_norm_round    0.785412
IMDB_norm_round               0.501043
Fandango_Difference           0.152141
dtype: float64
'''

rt_mt_user = float_df[['RT_user_norm', 'Metacritic_user_nom']]
rt_mt_user.apply(lambda x: np.std(x), axis=1)
'''
FILM
Avengers: Age of Ultron (2015)               0.375
Cinderella (2015)                            0.125
Ant-Man (2015)                               0.225
Do You Believe? (2015)                       0.925
Hot Tub Time Machine 2 (2015)                0.150
                                             ...
Mr. Holmes (2015)                            0.025
'71 (2015)                                   0.175
Two Days, One Night (2014)                   0.250
Gett: The Trial of Viviane Amsalem (2015)    0.200
Kumiko, The Treasure Hunter (2015)           0.025
Length: 146, dtype: float64
'''
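As a closing aside, the column-wise apply(lambda x: np.std(x)) can be replaced by the built-in DataFrame.std method; pandas defaults to the sample standard deviation (ddof=1), so ddof=0 is needed to reproduce the NumPy figures above. A minimal sketch:

# Population standard deviation of every float column, matching np.std.
print(float_df.std(ddof=0))
# Row-wise version of the two-column example.
print(rt_mt_user.std(axis=1, ddof=0).head())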