# Load the dataset from a CSV file into a DataFrame
df = pd.read_csv('pass_to_csv.csv')

# Quick inspection: first rows, last rows, summary statistics, (rows, columns)
df.head(n=5)
df.tail(n=5)
df.describe()
df.shape

# Count the missing (NaN) values in each column
df.isna().sum()

# Frequency of each distinct value in a column
df['column_name'].value_counts()

# Rows ordered by a column, smallest first (returns a new frame; df is unchanged)
df.sort_values("column_name")

# Distinct values that occur in a column
df['column_name'].unique()

# One-hot encode a column; dummy_na=True adds an extra indicator column for NaN
pd.get_dummies(data=df['column_name'], dummy_na=True)

# Categorize a numerical column.
#   bins:   the thresholds separating the categories
#   labels: the value assigned to each category
# With right=True the intervals are (0, 15], (15, 60] and (60, 80].
# The result is returned, not written back into df.
pd.cut(
    list(df['Age']),
    bins=[0, 15, 60, 80],
    labels=['young', 'adult', 'old'],
    right=True,
)

# Replace NaN in the given columns with a fixed value.
# Should not be used for numerical values.
df.fillna({'Age': 'adult'}, inplace=True)

# Remove columns from df in place
df.drop(columns=['column_name'], inplace=True)
# Create df
# Start from a 6x4 NumPy matrix of random samples (np.random.randn draws from
# the standard normal distribution, so the values differ on every run).
matrix = np.random.randn(6, 4)
matrix
# Example of what `matrix` looks like. This is notebook output kept as a
# comment so it is not executed as code (a bare `array(...)` call would fail
# unless `array` had been imported separately):
# array([[-0.62613284, -0.18401896,  0.36230433,  0.25385535],
#        [-0.52775779, -1.04001947, -0.70918754, -0.42403257],
#        [ 0.66380938,  1.5464776 , -0.88366703, -0.25407405],
#        [ 0.90132314, -0.74399435, -2.41854429,  0.01882765],
#        [-0.09548835,  0.59397054, -0.76961208, -0.89502256],
#        [ 0.21945467, -1.21903131, -1.74845886,  1.99341913]])

# Wrap the matrix in a DataFrame whose four columns are named 'A', 'B', 'C', 'D'
df2 = pd.DataFrame(matrix, columns=list('ABCD'))
# Remove the limit on how many columns are shown when a DataFrame is displayed
pd.set_option('display.max_columns', None)