[TOC]
# numerical value: replace missing ages with the mean of the column
# (Series.mean() skips NaN by default)
age_mean = df['Age'].mean()
df['Age'] = df['Age'].fillna(age_mean)
# categorical value: replace missing entries with the most frequent category
# The scipy import is kept in case other snippets still use `mode`, but it is
# no longer used here: recent SciPy releases return a scalar from mode() (so
# `[0][0]` fails) and do not support non-numeric input such as a string column
# containing NaN.
from scipy.stats import mode
# Series.mode() ignores NaN and returns the modes sorted, so .iloc[0] is the
# smallest of any tied values.
mode_embarked = df['Embarked'].mode().iloc[0]
df['Embarked'] = df['Embarked'].fillna(mode_embarked)
# missing values (replace with a fixed placeholder)
# Assign the result back to the column: calling fillna(inplace=True) on a
# selected column operates on an intermediate object and is not guaranteed to
# update train_file (it does not under pandas copy-on-write).
train_file['Name'] = train_file['Name'].fillna("Nameless")
# ----
# drop
# ----
# -- whole dataframe --
df.fillna(value=0)  # replace every missing value with 0
df.drop_duplicates()  # remove fully duplicated rows
# -- columns --
data.drop_duplicates(subset=['k1'])  # rows are duplicates when their k1 value repeats
df.dropna(axis='columns', how='all')  # remove columns made up entirely of missing values
# keep only the columns whose lower-cased name does not begin with 'pre'
cols = [col for col in df.columns if not col.lower().startswith('pre')]
df = df[cols]
# -- rows --
df_no_missing = df.dropna()  # rows without any missing value
df.dropna(thresh=5)  # keep rows holding at least five non-missing values
df.drop(index=['Cochice', 'Pima'])  # remove rows by index label
df_cleaned = df.dropna(how='all')  # remove rows in which every cell is NA
df = df.loc[df.name != 'Tina']  # remove rows whose name equals 'Tina'
df.drop(columns='reports')  # remove a column by name
# NOTE(review): the next two statements are R syntax, unlike the pandas
# snippets around them -- confirm they belong in this file.
# Build a character vector holding four city names.
cities <- c("Adelaide", "Brisbane", "Canberra", "Darwin")
# Query the levels of ds$location; presumably `location` is a factor column
# and `ds` is a data frame defined elsewhere -- TODO confirm.
levels(ds$location)
# fill in missing data
# Results are assigned back to the column instead of using inplace=True on a
# selected column: that form acts on an intermediate object, triggers a
# chained-assignment warning in pandas 2.x and leaves df unchanged under
# copy-on-write.
# Replace missing preTestScore values with the overall column mean.
df["preTestScore"] = df["preTestScore"].fillna(df["preTestScore"].mean())
# Replace missing postTestScore values with the mean of the row's "sex" group;
# transform("mean") returns a Series aligned to df's index, so each row is
# filled from its own group.
df["postTestScore"] = df["postTestScore"].fillna(
    df.groupby("sex")["postTestScore"].transform("mean")
)