pandas数据清洗
Posted languid
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了pandas数据清洗相关的知识,希望对你有一定的参考价值。
import pandas as pd import numpy as np from pandas import DataFrame import datetime import sys import pymysql import csv from sqlalchemy import create_engine from sqlalchemy.orm import scoped_session, sessionmaker # db = pymysql.connect(‘localhost‘, ‘root‘, ‘123456‘, ‘languid‘) engine = create_engine(‘mysql+pymysql://root:[email protected]/languid?charset=utf8‘) db = scoped_session(sessionmaker(bind=engine)) col_list = [‘user‘, ‘tm_type‘, ‘serv‘, ‘app‘, ‘record_time‘, ‘up_flux‘, ‘down_flux‘]#上网账号#终端类型#服务#app#记录时间#上行流量#下行流量 filepath=‘C://百度网盘//20181007_flux_40.csv‘ # def data_deal(filepath): if __name__ == ‘__main__‘: df_flux = pd.read_csv(filepath, sep=‘,‘, error_bad_lines=False, usecols=[3, 10, 11, 12, 15, 16, 17], names=col_list,engine=‘python‘,encoding = "utf-8",nrows=22222) df_flux.dropna(how=‘all‘,inplace=True) df_flux.dropna(subset=[‘user‘],inplace=True,axis=0) df_flux[‘record_time‘]=‘2019-5-28‘ df_flux2 = df_flux.groupby(by=[‘user‘,‘tm_type‘,‘serv‘,‘app‘,‘record_time‘])[‘up_flux‘,‘down_flux‘].sum() df_flux3 = df_flux.groupby(by=[‘user‘, ‘tm_type‘, ‘serv‘, ‘app‘, ‘record_time‘]).count() df_flux4 = df_flux3.drop([‘down_flux‘], axis=1) df_flux5 = df_flux4.rename(columns=‘up_flux‘: ‘counts‘, inplace=False) df_flux2=DataFrame(df_flux2) df_flux2 = df_flux2.rename(columns=‘up_flux‘: ‘up_flux_sum‘,‘down_flux‘:‘down_flux_sum‘) result = pd.concat([df_flux5, df_flux2], axis=1) print(result)
1.清洗数据中的全空行 2.清洗user列中的空值的行 3.统计上行流量列以及下行流量列的当天每人每终端服务app的总量。 4.统计每人每天终端服务app的次数。
import pandas as pd import numpy as np from pandas import DataFrame import datetime import sys import pymysql import csv from sqlalchemy import create_engine from sqlalchemy.orm import scoped_session, sessionmaker # db = pymysql.connect(‘localhost‘, ‘root‘, ‘123456‘, ‘languid‘) engine = create_engine(‘mysql+pymysql://root:[email protected]/languid?charset=utf8‘) db = scoped_session(sessionmaker(bind=engine)) col_list = [‘user‘, ‘tm_type‘, ‘serv‘, ‘app‘, ‘record_time‘, ‘up_flux‘, ‘down_flux‘]#上网账号#终端类型#服务#app#记录时间#上行流量#下行流量 student_list=[‘user‘,‘age‘,‘low‘,‘high‘,‘time‘] filepath=‘C://百度网盘//20181007_flux_40.csv‘ filepath2=‘C://百度网盘//v_student_net.csv‘ # def data_deal(filepath): if __name__ == ‘__main__‘: df_flux = pd.read_csv(filepath, sep=‘,‘, error_bad_lines=False, usecols=[3, 10, 11, 12, 15, 16, 17], names=col_list,engine=‘python‘,encoding = "utf-8") df_flux.dropna(how=‘all‘,inplace=True) df_flux.dropna(subset=[‘user‘],inplace=True,axis=0) df_flux[‘record_time‘]=‘2019-5-28‘ df_flux2 = df_flux.groupby([‘user‘, ‘tm_type‘, ‘serv‘, ‘app‘, ‘record_time‘], as_index=False)[‘up_flux‘, ‘down_flux‘].sum() df_flux3 = df_flux.groupby(by=[‘user‘, ‘tm_type‘, ‘serv‘, ‘app‘, ‘record_time‘],as_index=False).count() df_flux4 = df_flux3.drop([‘down_flux‘], axis=1) df_flux5 = df_flux4.rename(columns=‘up_flux‘: ‘counts‘, inplace=False) df_flux2=DataFrame(df_flux2) df_flux2 = df_flux2.rename(columns=‘up_flux‘: ‘up_flux_sum‘,‘down_flux‘:‘down_flux_sum‘) result = pd.concat([df_flux2, df_flux5[‘counts‘]], axis=1) result_1 = df_flux2[~df_flux2[‘user‘].str.contains(‘10\.‘)] result_1[‘down_flux_sum‘] = result_1[‘down_flux_sum‘].astype(float) # result_1[‘user‘] = result_1[‘user‘].astype(float) # qqq = result_1[result_1[‘user‘]] result_1[‘tm_type‘].replace(‘\/移动终端\/\w*系统移动终端‘,‘mobile‘,regex=True,inplace=True) result_1.loc[result_1[‘tm_type‘].str.contains(‘多终端‘),‘tm_type‘]=‘多终端‘ result_1.loc[result_1[‘tm_type‘].str.contains(‘未知类型‘), ‘tm_type‘] = ‘Unknown‘ result_1[‘tm_type‘].replace(‘\/PC\/MAC PC‘,‘PC‘,regex=True,inplace=True) v_student = pd.read_csv(filepath2,sep=‘,‘,error_bad_lines=False,engine=‘python‘,encoding=‘utf-8‘,header=0,index_col=[0]) unique_value = v_student[‘username‘].nunique() v_student = v_student.rename(columns=‘username‘: ‘user‘, inplace=False) student_merge=pd.merge(v_student,result_1,how=‘inner‘) student_group = student_merge.groupby([‘class_code‘],as_index=False)[‘down_flux_sum‘] student_group_2 =student_merge.groupby([‘class_code‘],as_index=False)[‘up_flux_sum‘].count() student_group_3 = student_group_2.rename(columns=‘up_flux_sum‘: ‘counts‘, inplace=False)
1.用正则表达以及loc清洗tm_type列的数据,做以下更改
系统移动终端=mobile()
pc=pc()
多终端=多终端()
未知=unknown()
2.ip数据过滤() 将user列中的为ip的数据行过滤
3.类型转换=上行流量转化成其他类型()
以上是关于pandas数据清洗的主要内容,如果未能解决你的问题,请参考以下文章