numpy、pandas 数据处理
Posted 爱码哥
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了 numpy、pandas 数据处理相关的知识,希望对你有一定的参考价值。
import os

import numpy as np

# Vocabulary used to classify each word of the text.
positive_list = ['good', 'nice', 'friendly', 'great', 'clean', 'comfortable',
                 'amazing', 'enjoyable', 'wonderful', 'great']
negative_list = ['poor', 'bad', 'unfriendly', 'horrible', 'dirty',
                 'uncomfortable']


def tokenize(text):
    """Split *text* into words, ignoring commas, periods and line breaks.

    Args:
        text: The raw text to split.

    Returns:
        A list of non-empty, whitespace-separated tokens. Matching stays
        case-sensitive; no case folding is applied.
    """
    # Replace the real newline character (not a literal backslash + "n"),
    # otherwise the last word of every line keeps a trailing line break.
    cleaned = text.replace(',', '').replace('.', '').replace('\n', ' ')
    # split() without arguments drops every empty token, so no manual
    # removal of '' entries is needed (and nothing raises when none exist).
    return cleaned.split()


def count_sentiment(words):
    """Count how many of *words* are positive, negative, or neither.

    Args:
        words: An iterable of word strings.

    Returns:
        A dict with the keys 'positive', 'negative' and 'other'.
    """
    # Build the lookup sets once so each membership test is O(1).
    positive = set(positive_list)
    negative = set(negative_list)
    found = {'positive': 0, 'negative': 0, 'other': 0}
    for word in words:
        if word in positive:
            found['positive'] += 1
        elif word in negative:
            found['negative'] += 1
        else:
            found['other'] += 1
    return found


def sentiment_percentages(words):
    """Return the share (in percent) of each sentiment class in *words*.

    Args:
        words: An iterable of word strings.

    Returns:
        A dict with the keys 'positive', 'negative' and 'other' mapped to
        percentages. All values are 0.0 when *words* is empty, which avoids
        a division by zero.
    """
    words = list(words)
    found = count_sentiment(words)
    total = len(words)
    if total == 0:
        return {label: 0.0 for label in found}
    return {label: count / total * 100 for label, count in found.items()}


def run_sentiment_report():
    """Read alice.txt from the exercise directory and print the percentages."""
    os.chdir('/Users/p/Desktop/Exercise/')
    with open('alice.txt') as f:
        result = f.read()
    shares = sentiment_percentages(tokenize(result))
    print('positive is:%f%%,negative is:%f%%,other is %f%%'
          % (shares['positive'], shares['negative'], shares['other']))


if __name__ == '__main__':
    run_sentiment_report()
import os

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# Quantitative columns whose missing values are imputed.
quant_keys = ["beer_servings", "spirit_servings", "wine_servings",
              "total_litres_of_pure_alcohol"]


def count_missing_or_zero(df):
    """Return how many cells of *df* are NaN or equal to 0.

    Args:
        df: The DataFrame to inspect.

    Returns:
        The total number of NaN cells plus zero-valued cells, as an int.
    """
    num_nan = df.isna().sum()    # isna() flags missing values per column
    num_zero = (df == 0).sum()   # df == 0 flags zero-valued cells per column
    return int((num_nan + num_zero).sum())


def fill_missing_with_median(df, columns, zero_is_missing=True):
    """Return a copy of *df* with missing values replaced by column medians.

    Args:
        df: The source DataFrame; it is not modified.
        columns: Names of the numeric columns to impute.
        zero_is_missing: When True (the default), zeros are replaced as well
            as NaN, matching the script's own definition of a missing value
            ("NaN or 0"). Pass False to replace NaN only.

    Returns:
        A new DataFrame with the imputed columns.
    """
    # The medians are taken from the data as given (NaN skipped, zeros
    # included), exactly as the script reports them before imputing.
    medians = df[columns].median(axis=0)
    filled = df.copy()
    for col in columns:
        values = filled[col]
        if zero_is_missing:
            values = values.mask(values == 0)  # turn zeros into NaN first
        filled[col] = values.fillna(medians[col])
    return filled


def continent_means(df):
    """Return the mean beer and wine servings for each continent.

    Args:
        df: A DataFrame with 'continent', 'beer_servings' and
            'wine_servings' columns.

    Returns:
        A ``(beer_ave, wine_ave)`` tuple of dicts keyed by continent, in
        order of first appearance. Rows whose continent is NaN are skipped.
    """
    grouped = df.groupby('continent', sort=False)
    beer_ave = grouped['beer_servings'].mean().to_dict()
    wine_ave = grouped['wine_servings'].mean().to_dict()
    return beer_ave, wine_ave


def run_drinks_report():
    """Load Drinks.csv from the exercise directory and print the summary."""
    os.chdir('/Users/p/Desktop/Exercise/')
    # NOTE(review): read_csv's default NA parsing turns the literal string
    # "NA" into NaN. If the continent column uses "NA" for North America,
    # those rows are dropped from the per-continent averages; confirm
    # against the data file.
    df = pd.read_csv('Drinks.csv')
    print(type(df))
    print(df.shape)
    # shape is (rows, columns); the index is zero-based, so a last index of
    # 192 means 193 rows.
    print('数据集中的变量%d条数据,变量%d个:' % (df.shape[0], df.shape[1]))
    print('数据中缺失值(NaN或者0)的数量为:', count_missing_or_zero(df))
    # numeric_only=True keeps the text columns out of the median, which
    # would otherwise raise on recent pandas versions.
    print('相应变量的中位数:\n', df.median(axis=0, numeric_only=True))
    df = fill_missing_with_median(df, quant_keys)
    print(df)
    beer_ave, wine_ave = continent_means(df)
    print('不同大陆(continent)的平均啤酒消耗(beer_servings)是:', beer_ave)
    print('不同大陆平均紅酒消耗(wine_servings)是:', wine_ave)


if __name__ == '__main__':
    run_drinks_report()
以上是关于 numpy、pandas 数据处理的主要内容。如果未能解决你的问题,请参考以下文章。