关于机器学习二分类建模的一些代码
Posted 卖山楂啦prss
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了关于机器学习二分类建模的一些代码相关的知识,希望对你有一定的参考价值。
之前写的一些代码
# 忽略警告
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei'] #用来显示中文标签
plt.rcParams['axes.unicode_minus'] = False #用来正常显示负号
# 分类算法
from sklearn.svm import SVC,LinearSVC #支持向量机
from sklearn.linear_model import LogisticRegression #逻辑回归
from sklearn.neighbors import KNeighborsClassifier #KNN算法
from sklearn.cluster import KMeans #K-Means 聚类算法
from sklearn.naive_bayes import GaussianNB #朴素贝叶斯
from sklearn.tree import DecisionTreeClassifier #决策树
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
# 分类算法--集成学习
import xgboost as xgb
import lightgbm as lgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier #随机森林
from sklearn.ensemble import GradientBoostingClassifier
# 模型评估
from sklearn.metrics import classification_report,precision_score,recall_score,f1_score
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix #混淆矩阵
from sklearn.metrics import silhouette_score #轮廓系数(评价k-mean聚类效果)
from sklearn.model_selection import GridSearchCV #交叉验证
from sklearn.metrics import make_scorer
from sklearn.ensemble import VotingClassifier #投票
# 数据处理
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit #分层抽样
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import log_loss
from sklearn import feature_selection
from sklearn.utils import shuffle
from sklearn import metrics
from tqdm import tqdm
import time
from scipy import stats
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf
from sklearn.utils import shuffle
# 深度学习包
import tensorflow as tf
import tensorflow.keras as K
from tensorflow.keras import Sequential, utils, regularizers, Model, Input
from tensorflow.keras.layers import Flatten, Dense, Conv1D, MaxPool1D, Dropout, AvgPool1D
# Display settings for pandas output.
pd.set_option('display.max_columns', None)  # show every column
pd.set_option('display.max_rows', None)  # show every row of a DataFrame
pd.set_option('display.max_info_columns', 200)  # maximum number of columns in info() output
pd.set_option('display.precision', 2)  # precision of float columns
# A single formatter provides both the thousands separator and two decimals.
# Previously 'display.float_format' was set twice in a row; the second value
# ('{:,}') replaced the first ('{:,.2f}'), so the two-decimal rounding was lost.
pd.set_option('display.float_format', '{:,.2f}'.format)
# To restore every option to its default: pd.reset_option('all')
# Column names X1..X266, applied to the header-less input files.
import pandas as pd

col_name = ['X' + str(x) for x in range(1, 267)]

# Positive samples: read four pipe-delimited part files and stack them.
# NOTE(review): these are the same paths the negative-sample loading code
# below reads for parts 0-3 -- presumably the directory content was swapped
# between runs; confirm.
# The frames are collected first and concatenated once; concatenating inside
# the loop re-copies everything read so far on every iteration.
positive_parts = []
for i in range(4):
    path = '/dev/shm/test/00000{}_0'.format(i)
    dat_1 = pd.read_table(path, sep='|', header=None, names=col_name,
                          encoding='utf-8', engine='python')
    positive_parts.append(dat_1)
data_pos = pd.concat(positive_parts, axis=0)
# Negative samples: part files 0-119 are loaded in four groups of 30 and each
# group is written out as its own CSV.
col_name = ['X' + str(x) for x in range(1, 267)]


def _read_parts(part_ids):
    """Read the given part files and return them stacked into one DataFrame.

    Part ``k`` is read from ``/dev/shm/test/<k zero-padded to 6 digits>_0``.
    The files are parsed as pipe-delimited text without a header row and get
    ``col_name`` as column names.

    Args:
        part_ids: iterable of integer part numbers.

    Returns:
        pandas.DataFrame holding the rows of all parts, in the order given.
    """
    # Zero padding replaces the former if/elif chain: it yields the same file
    # names for 0-119, whereas the chain left `path` unchanged (stale) for any
    # larger index.
    frames = [
        pd.read_table(
            '/dev/shm/test/{:06d}_0'.format(part_id),
            sep='|',
            header=None,
            names=col_name,
            encoding='utf-8',
            engine='python',
        )
        for part_id in part_ids
    ]
    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(frames, axis=0)


data_all_1 = _read_parts(range(0, 30))
data_all_2 = _read_parts(range(30, 60))
data_all_3 = _read_parts(range(60, 90))
data_all_4 = _read_parts(range(90, 120))

# Persist each group without the index column.
data_all_1.to_csv('/dev/shm/test/data_all_1.csv', index=False)
data_all_2.to_csv('/dev/shm/test/data_all_2.csv', index=False)
data_all_3.to_csv('/dev/shm/test/data_all_3.csv', index=False)
data_all_4.to_csv('/dev/shm/test/data_all_4.csv', index=False)
# Reload the four group CSVs and merge them into a single file.
# NOTE(review): the group files are written under /dev/shm/test/ earlier in
# this script but read from /cmyy/ here -- presumably they were moved in
# between; confirm.
col_name = ['X' + str(x) for x in range(1, 267)]

group_paths = [
    '/cmyy/data_all_1.csv',
    '/cmyy/data_all_2.csv',
    '/cmyy/data_all_3.csv',
    '/cmyy/data_all_4.csv',
]
# header=0 consumes the header row of each file and names=col_name labels the
# columns X1..X266 (assumes the files carry the header row that to_csv writes
# by default -- confirm).  The previous code read with header=None, which
# turned that header row into a data row at index 0 of every group, and then
# tried to repair it by renaming columns and dropping index 0 through the
# undefined names `df` and `data`, which raises NameError.
data_all_1, data_all_2, data_all_3, data_all_4 = [
    pd.read_csv(group_path, header=0, names=col_name,
                encoding='utf-8', engine='python')
    for group_path in group_paths
]
data_all = pd.concat([data_all_1, data_all_2, data_all_3, data_all_4], axis=0)
# Write every column.  The previous call passed columns=['name'], a column
# that does not exist in this frame, which makes to_csv raise KeyError.
data_all.to_csv('/cmyy/data_all.csv', index=False)
import pandas as pd
def read_single_csv(input_path, chunksize=1000000, encoding='utf-8'):
    """Read a CSV file chunk by chunk and return it as one DataFrame.

    Args:
        input_path: path of the CSV file to read.
        chunksize: number of rows parsed per chunk (default 1,000,000, the
            value that used to be hard-coded).
        encoding: text encoding of the file (default 'utf-8', as before).

    Returns:
        pandas.DataFrame containing all rows of the file; the row index
        runs continuously across chunk boundaries.
    """
    # With chunksize set, read_csv returns an iterator of DataFrames instead
    # of a single frame.
    reader = pd.read_csv(input_path, chunksize=chunksize, encoding=encoding)
    # Materialise all chunks, then concatenate once.
    return pd.concat(list(reader))
# Load the merged data set back from disk through the chunked reader.
data_all = read_single_csv('/cmyy/data_all.csv')
# Negative-sample down-sampling, stratified on column X266.
#
# Every sample keeps the class proportions of X266 and is sized as a multiple
# of BASE_SAMPLE_SIZE (the 1:1, 1:2 and 1:3 ratios labelled below).
BASE_SAMPLE_SIZE = 719354  # presumably the positive-sample row count -- TODO confirm


def _stratified_sample(frame, total, label_col='X266'):
    """Randomly draw about ``total`` rows from ``frame``, proportionally per class.

    Each distinct value of ``label_col`` contributes
    ``round(class_share * total)`` rows, drawn without replacement by
    ``DataFrame.sample``.  Because of the per-class rounding the result may
    differ from ``total`` by a few rows.  No ``random_state`` is set, so every
    call draws an independent sample.

    The class label and its row quota come from the same ``value_counts``
    result.  The previous code zipped quotas computed from ``value_counts()``
    (ordered by frequency) with labels from ``unique()`` (ordered by first
    appearance), so quotas could be applied to the wrong classes.  It also
    built per-class variable names with ``exec``/``eval``, which fails for
    labels that do not form a valid identifier.

    Args:
        frame: DataFrame to sample from.
        total: target number of rows across all classes.
        label_col: column holding the class label.

    Returns:
        DataFrame with the sampled rows of all classes stacked together
        (an empty frame with the same columns when ``frame`` has no rows).

    Raises:
        ValueError: from ``DataFrame.sample`` when a class has fewer rows
            than its quota.
    """
    class_counts = frame[label_col].value_counts()
    n_rows = class_counts.sum()
    pieces = []
    for label, count in class_counts.items():
        share = float(count / n_rows)
        quota = int(round(share * total, 0))
        pieces.append(frame[frame[label_col] == label].sample(n=quota, axis=0))
    if not pieces:
        # Nothing to concatenate; pd.concat([]) would raise.
        return frame.iloc[0:0]
    return pd.concat(pieces, axis=0)


# 1:1 -- two independent draws from the first group of negative samples.
data_all_1['X266'] = data_all_1['X266'].astype(str)
data_all_sample_11_01 = _stratified_sample(data_all_1, BASE_SAMPLE_SIZE)
data_all_sample_11_01.to_csv('/cmyy/data_all_sample_11_01.csv', index=False)
data_all_sample_11_02 = _stratified_sample(data_all_1, BASE_SAMPLE_SIZE)
data_all_sample_11_02.to_csv('/cmyy/data_all_sample_11_02.csv', index=False)

# 1:2 -- two independent draws, twice the base size, same source frame.
data_all_sample_12_01 = _stratified_sample(data_all_1, BASE_SAMPLE_SIZE * 2)
data_all_sample_12_01.to_csv('/cmyy/data_all_sample_12_01.csv', index=False)
data_all_sample_12_02 = _stratified_sample(data_all_1, BASE_SAMPLE_SIZE * 2)
data_all_sample_12_02.to_csv('/cmyy/data_all_sample_12_02.csv', index=False)

# 1:3 -- one draw of three times the base size from the full negative set.
data_all['X266'] = data_all['X266'].astype(str)
data_all_sample_13_01 = _stratified_sample(data_all, BASE_SAMPLE_SIZE * 3)
data_all_sample_13_01.to_csv('/cmyy/data_all_sample_13_01.csv', index=False)
# 负样本抽样
# 1:3
df_data =
(原文代码在此处被截断)
以上是关于“关于机器学习二分类建模的一些代码”的主要内容,如果未能解决你的问题,请参考以下文章:
机器学习实验四基于Logistic Regression二分类算法实现手部姿态识别
机器学习:朴素贝叶斯分类器实现二分类(伯努利型) 代码+项目实战
数学建模暑期集训11:逻辑回归(Logistic Regression)处理二分类问题