pandas Data Cleaning

Posted by languid


import pandas as pd
import numpy as np
from pandas import DataFrame
import datetime
import sys
import pymysql
import csv
from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker


# db = pymysql.connect('localhost', 'root', '123456', 'languid')
engine = create_engine('mysql+pymysql://root:[email protected]/languid?charset=utf8')
db = scoped_session(sessionmaker(bind=engine))


col_list = ['user', 'tm_type', 'serv', 'app', 'record_time', 'up_flux', 'down_flux']  # account, terminal type, service, app, record time, upstream traffic, downstream traffic

filepath = 'C://百度网盘//20181007_flux_40.csv'
# def data_deal(filepath):
if __name__ == '__main__':
    # error_bad_lines was removed in pandas 2.0; on newer versions use on_bad_lines='skip'
    df_flux = pd.read_csv(filepath, sep=',', error_bad_lines=False, usecols=[3, 10, 11, 12, 15, 16, 17],
                          names=col_list, engine='python', encoding='utf-8', nrows=22222)
    df_flux.dropna(how='all', inplace=True)                  # 1. drop rows that are entirely empty
    df_flux.dropna(subset=['user'], inplace=True, axis=0)    # 2. drop rows whose user is null
    df_flux['record_time'] = '2019-5-28'
    # 3. per-day traffic totals per user/terminal/service/app
    df_flux2 = df_flux.groupby(by=['user', 'tm_type', 'serv', 'app', 'record_time'])[['up_flux', 'down_flux']].sum()
    # 4. per-day record counts per user/terminal/service/app
    df_flux3 = df_flux.groupby(by=['user', 'tm_type', 'serv', 'app', 'record_time']).count()
    df_flux4 = df_flux3.drop(['down_flux'], axis=1)
    df_flux5 = df_flux4.rename(columns={'up_flux': 'counts'}, inplace=False)
  
    df_flux2=DataFrame(df_flux2)
    df_flux2 = df_flux2.rename(columns={'up_flux': 'up_flux_sum', 'down_flux': 'down_flux_sum'})
   
    result = pd.concat([df_flux5, df_flux2], axis=1)
    
    print(result)
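Note that the SQLAlchemy engine and scoped session created at the top of the script are never used here; the result is only printed. If the aggregation were meant to be written back into the languid database, a minimal sketch could look like the line below (the table name flux_summary is a hypothetical placeholder, not something from the original code):

# hypothetical persistence step; 'flux_summary' is an assumed table name
result.to_sql('flux_summary', con=engine, if_exists='replace', index=True)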

The script above performs four cleaning tasks: 1. drop rows that are entirely empty; 2. drop rows whose user column is null; 3. sum the upstream and downstream traffic per user, terminal type, service and app for the day; 4. count the number of records per user, terminal type, service and app per day.
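As a minimal sketch on invented data (all values below are made up purely for illustration), the dropna and groupby steps behave like this:

import pandas as pd

# toy data: one fully empty row and one row with a missing user
toy = pd.DataFrame({
    'user':      ['alice', 'alice', None, None],
    'tm_type':   ['PC',    'PC',    'PC', None],
    'up_flux':   [10,      20,      5,    None],
    'down_flux': [100,     200,     50,   None],
})
toy.dropna(how='all', inplace=True)         # 1. removes the last, fully empty row
toy.dropna(subset=['user'], inplace=True)   # 2. removes the row with no user

# 3. traffic totals and 4. record counts per user/terminal
sums = toy.groupby(['user', 'tm_type'], as_index=False)[['up_flux', 'down_flux']].sum()
counts = toy.groupby(['user', 'tm_type'], as_index=False)['up_flux'].count()
counts = counts.rename(columns={'up_flux': 'counts'})
print(sums)    # alice/PC: up_flux=30.0, down_flux=300.0
print(counts)  # alice/PC: counts=2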

 

import pandas as pd
import numpy as np
from pandas import DataFrame
import datetime
import sys
import pymysql
import csv
from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker


# db = pymysql.connect('localhost', 'root', '123456', 'languid')
engine = create_engine('mysql+pymysql://root:[email protected]/languid?charset=utf8')
db = scoped_session(sessionmaker(bind=engine))


col_list = ['user', 'tm_type', 'serv', 'app', 'record_time', 'up_flux', 'down_flux']  # account, terminal type, service, app, record time, upstream traffic, downstream traffic
student_list = ['user', 'age', 'low', 'high', 'time']

filepath = 'C://百度网盘//20181007_flux_40.csv'
filepath2 = 'C://百度网盘//v_student_net.csv'
# def data_deal(filepath):
if __name__ == '__main__':
    df_flux = pd.read_csv(filepath, sep=',', error_bad_lines=False, usecols=[3, 10, 11, 12, 15, 16, 17],
                          names=col_list, engine='python', encoding='utf-8')
    df_flux.dropna(how='all', inplace=True)
    df_flux.dropna(subset=['user'], inplace=True, axis=0)
    df_flux['record_time'] = '2019-5-28'
    df_flux2 = df_flux.groupby(['user', 'tm_type', 'serv', 'app', 'record_time'], as_index=False)[['up_flux', 'down_flux']].sum()
    df_flux3 = df_flux.groupby(by=['user', 'tm_type', 'serv', 'app', 'record_time'], as_index=False).count()
    df_flux4 = df_flux3.drop(['down_flux'], axis=1)
    df_flux5 = df_flux4.rename(columns={'up_flux': 'counts'}, inplace=False)
    df_flux2=DataFrame(df_flux2)
    df_flux2 = df_flux2.rename(columns={'up_flux': 'up_flux_sum', 'down_flux': 'down_flux_sum'})
    result = pd.concat([df_flux2, df_flux5['counts']], axis=1)
    # filter out rows whose user column is an IP address (contains "10.")
    result_1 = df_flux2[~df_flux2['user'].str.contains(r'10\.')]
    # cast the downstream traffic total to float
    result_1['down_flux_sum'] = result_1['down_flux_sum'].astype(float)
    # result_1['user'] = result_1['user'].astype(float)
    # qqq = result_1[result_1['user']]
    # normalise the terminal-type labels with regex / loc
    result_1['tm_type'].replace(r'\/移动终端\/\w*系统移动终端', 'mobile', regex=True, inplace=True)
    result_1.loc[result_1['tm_type'].str.contains('多终端'), 'tm_type'] = '多终端'
    result_1.loc[result_1['tm_type'].str.contains('未知类型'), 'tm_type'] = 'Unknown'
    result_1['tm_type'].replace(r'\/PC\/MAC PC', 'PC', regex=True, inplace=True)




    v_student = pd.read_csv(filepath2, sep=',', error_bad_lines=False, engine='python', encoding='utf-8', header=0, index_col=[0])
    unique_value = v_student['username'].nunique()
    v_student = v_student.rename(columns={'username': 'user'}, inplace=False)
    student_merge = pd.merge(v_student, result_1, how='inner')   # inner join on the column(s) the two frames share (the renamed 'user' column)
    student_group = student_merge.groupby(['class_code'], as_index=False)['down_flux_sum']
    student_group_2 = student_merge.groupby(['class_code'], as_index=False)['up_flux_sum'].count()
    student_group_3 = student_group_2.rename(columns={'up_flux_sum': 'counts'}, inplace=False)

1. Use regular expressions and loc to clean the tm_type column, making the following changes (a short sketch follows this list):

系统移动终端 (mobile terminals) → mobile

/PC/MAC PC → PC

多终端 (multi-terminal) → 多终端

未知类型 (unknown type) → Unknown

2. IP filtering: filter out the rows whose user column is an IP address.

3. Type conversion: cast the traffic total column to float.
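A minimal sketch of these three steps on invented data (the tm_type strings mirror the patterns used above; the users and traffic values are made up for illustration):

import pandas as pd

toy = pd.DataFrame({
    'user': ['alice', '10.1.2.3', 'bob'],
    'tm_type': ['/移动终端/Android系统移动终端', '/PC/MAC PC', '未知类型'],
    'down_flux_sum': ['100', '200', '300'],
})

# 2. drop rows whose user looks like an IP address
toy = toy[~toy['user'].str.contains(r'10\.')].copy()

# 1. normalise the terminal-type labels
toy['tm_type'] = toy['tm_type'].replace(r'\/移动终端\/\w*系统移动终端', 'mobile', regex=True)
toy['tm_type'] = toy['tm_type'].replace(r'\/PC\/MAC PC', 'PC', regex=True)
toy.loc[toy['tm_type'].str.contains('未知类型'), 'tm_type'] = 'Unknown'

# 3. cast the traffic total from string to float
toy['down_flux_sum'] = toy['down_flux_sum'].astype(float)

print(toy)  # alice/mobile/100.0 and bob/Unknown/300.0 remain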
