Python数据分析

Posted 2021-09-01 雨宙

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了Python数据分析相关的知识，希望对你有一定的参考价值。

Python数据分析（二）

打卡第六天啦！！！

Numpy库（二）

NAN和INF值的认识

import numpy as np
data = np.random.randint(0,10,size=(3,5))
data = data.astype(np.float)
data[0,1] = np.NAN
print(data)
# [[ 0. nan  1.  2.  5.]
#  [ 4.  2.  4.  4.  9.]
#  [ 9.  4.  5.  9.  1.]]
print(data/0)
# [[nan nan inf inf inf]
#  [inf inf inf inf inf]
#  [inf inf inf inf nan]]
print(np.nan == np.nan)
# False

NAN和INF值的处理

NAN: Not A Number的简写，不是一个数字，但是他是属于浮点类型。
INF:无穷大，在除数为0的情况下会出现INF。
NAN和所有的值进行计算结果都是等于NAN
NAN !=NAN
可以通过np.isnan来判断某个值是不是NAN。
处理值的时候，可以通过删除NAN的形式进行处理，也可以通过值的替换进行处理。
np.delete比较特殊，他通过axis=0来代表行，而其他大部分函数是通过axis=1来代表行。

# 删除缺失值
print(data)
# [[ 4. nan  9.  1.  8.]
#  [ 2.  9. nan  9.  0.]
#  [ 0.  2.  1.  6.  8.]]
np.isnan(data)
# array([[False,  True, False, False, False],
#        [False, False,  True, False, False],
#        [False, False, False, False, False]])
print(data[~np.isnan(data)])
# [4. 9. 1. 8. 2. 9. 9. 0. 0. 2. 1. 6. 8.]

# 替换缺失值
# 使用0替换缺失值
scores = np.loadtxt("scores.csv",dtype=np.str,delimiter=",",skiprows=1)
scores = scores.astype('U3')
scores[scores == ""] = np.NAN
scores1 = scores.astype(np.float)
scores1[np.isnan(scores1)] = 0
scores1.sum(axis=0)    
# 使用平均数替换缺失值
scores2 = scores.astype(np.float)
for x in range(scores2.shape[1]):
    col = scores2[:,x]
    non_nan_col = col[~np.isnan(col)]
    mean = non_nan_col.mean()
    col[np.isnan(col)] = mean
    pass
print(scores2)
# [[81.         76.        ]
#  [75.70588235 84.        ]
#  [90.         72.38888889]
#  [72.         50.        ]
#  [57.         89.        ]
#  [76.         78.        ]
#  [67.         77.        ]
#  [89.         84.        ]
#  [73.         90.        ]
#  [76.         59.        ]
#  [61.         75.        ]
#  [85.         78.        ]
#  [83.         72.38888889]
#  [81.         91.        ]
#  [40.         14.        ]
#  [83.         61.        ]
#  [75.70588235 98.        ]
#  [75.70588235 91.        ]
#  [88.         84.        ]
#  [85.         24.        ]]

random模块

import numpy as np
np.random.seed(1)
np.random.rand()
# 0.417022004702574

np.random.rand(2,3)
# array([[7.20324493e-01, 1.14374817e-04, 3.02332573e-01],
#        [1.46755891e-01, 9.23385948e-02, 1.86260211e-01]])

np.random.randn()
# -1.1059350760083153

data = np.arange(5)
np.random.choice(data,size=(3,4))
# array([[2, 0, 4, 1],
#        [2, 2, 1, 0],
#        [1, 3, 4, 3]])
np.random.choice(5,3)
# array([4, 3, 4])

data1 = np.arange(10)
print(data1)
np.random.shuffle(data1)
print(data1)
# [0 1 2 3 4 5 6 7 8 9]
# [6 2 3 0 1 9 5 8 7 4]

axis轴理解

最外面的括号代表着axis=0，依次往里的括号对应的axis的计数就依次加1
操作方式：如果指定轴进行相关的操作，那么他会使用轴下的每个直接子元素的第0个、第1个、第2个…分别进行相关操作，以上图为例，当axis=0时，对应（0,2）和（1,3），当axis=1时，对应（0,1）和（2,3）。

a = np.arange(0,4).reshape(2,2)
print(a)
# [[0 1]
#  [2 3]]
print(a.sum(axis=0)) # [2 4]
print(a.sum(axis=1)) # [1 5]
print(a.max(axis=0)) # [2 3]
print(a.max(axis=1)) # [1 3]

y = np.arange(24).reshape(2,2,6)
print(y)
# [[[ 0  1  2  3  4  5]
#   [ 6  7  8  9 10 11]]

#  [[12 13 14 15 16 17]
#   [18 19 20 21 22 23]]]
y.max(axis=0)
# array([[12, 13, 14, 15, 16, 17],
#        [18, 19, 20, 21, 22, 23]])
y.max(axis=1)
# array([[ 6,  7,  8,  9, 10, 11],
#        [18, 19, 20, 21, 22, 23]])
y.max(axis=2)
# array([[ 5, 11],
#        [17, 23]])

np.delete是直接删除指定轴下的第几个直接子元素

b = np.delete(a,0,axis=0)
print(b)
# [[2 3]]

通用函数

一元函数

a = np.random.uniform(-10,10,size=(3,5))
print(a)
# [[ 1.02649882 -8.59837008 -0.55049924  4.8552966  -6.16087844]
#  [-0.71376993 -5.39274383  0.16495075 -5.82863377 -9.0139151 ]
#  [ 0.37723752 -6.55443296 -2.07373425 -7.85269394  0.19620556]]
np.abs(a)
# array([[1.02649882, 8.59837008, 0.55049924, 4.8552966 , 6.16087844],
#        [0.71376993, 5.39274383, 0.16495075, 5.82863377, 9.0139151 ],
#        [0.37723752, 6.55443296, 2.07373425, 7.85269394, 0.19620556]])
np.sqrt(np.abs(a))
# array([[1.01316278, 2.93229775, 0.74195636, 2.20347376, 2.48211169],
#        [0.84484906, 2.3222282 , 0.40614129, 2.41425636, 3.00231829],
#        [0.61419664, 2.56016268, 1.44004661, 2.80226586, 0.44295097]])
np.square(a)
# array([[1.05369984e+00, 7.39319680e+01, 3.03049413e-01, 2.35739051e+01,
#         3.79564231e+01],
#        [5.09467517e-01, 2.90816860e+01, 2.72087504e-02, 3.39729716e+01,
#         8.12506655e+01],
#        [1.42308144e-01, 4.29605914e+01, 4.30037374e+00, 6.16648021e+01,
#         3.84966231e-02]])
np.exp(a)
# array([[2.79127596e+00, 1.84406116e-04, 5.76661846e-01, 1.28418775e+02,
#         2.11039861e-03],
#        [4.89794221e-01, 4.54947320e-03, 1.17933504e+00, 2.94209382e-03,
#         1.21704436e-04],
#        [1.45825063e+00, 1.42378999e-03, 1.25715451e-01, 3.88703412e-04,
#         1.21677700e+00]])
np.log(np.abs(a))
# array([[ 0.02615381,  2.15157266, -0.5969297 ,  1.58007019,  1.81821937],
#        [-0.33719459,  1.68505432, -1.80210833,  1.76278263,  2.19876951],
#        [-0.97488027,  1.88014161,  0.72935097,  2.06085665, -1.62859238]])
np.sign(a)
# array([[ 1., -1., -1.,  1., -1.],
#        [-1., -1.,  1., -1., -1.],
#        [ 1., -1., -1., -1.,  1.]])
np.ceil(a)
# array([[ 2., -8., -0.,  5., -6.],
#        [-0., -5.,  1., -5., -9.],
#        [ 1., -6., -2., -7.,  1.]])
np.floor(a)
# array([[  1.,  -9.,  -1.,   4.,  -7.],
#        [ -1.,  -6.,   0.,  -6., -10.],
#        [  0.,  -7.,  -3.,  -8.,   0.]])
# 四舍五入
np.rint(a)
# array([[ 1., -9., -1.,  5., -6.],
#        [-1., -5.,  0., -6., -9.],
#        [ 0., -7., -2., -8.,  0.]])
# 分隔整数和小数部分
np.modf(a)
# (array([[ 0.02649882, -0.59837008, -0.55049924,  0.8552966 , -0.16087844],
#         [-0.71376993, -0.39274383,  0.16495075, -0.82863377, -0.0139151 ],
#         [ 0.37723752, -0.55443296, -0.07373425, -0.85269394,  0.19620556]]),
#  array([[ 1., -8., -0.,  4., -6.],
#         [-0., -5.,  0., -5., -9.],
#         [ 0., -6., -2., -7.,  0.]]))
np.sin(a)
# array([[ 0.85549127, -0.73550238, -0.52311278, -0.98980607,  0.12200217],
#        [-0.65468811,  0.77734954,  0.16420375,  0.43905944, -0.39940052],
#        [ 0.36835364, -0.26793368, -0.87617026, -0.99999917,  0.19494911]])

二元函数

np.add(a,np.random.randint(0,5,size=(3,1)))
np.greater(a,0) # 求出所以大于0的数
# array([[ True, False, False,  True, False],
#        [False, False,  True, False, False],
#        [ True, False, False, False,  True]])
np.logical_and(a>0,a<5)
# array([[ True, False, False,  True, False],
#        [False, False,  True, False, False],
#        [ True, False, False, False,  True]])
np.logical_or(a>5,a<0)
# array([[False,  True,  True, False,  True],
#        [ True,  True, False,  True,  True],
#        [False,  True,  True,  True, False]])

聚合函数

# 计算元素的和
np.sum(a,axis=0)
# array([  0.68996641, -20.54554687,  -2.45928274,  -8.82603111,
#        -14.97858798])

# 计算元素的积
np.prod(a,axis=0)
# array([-2.76395892e-01, -3.03921238e+02,  1.88305984e-01,  2.22229242e+02,
#         1.08960082e+01])

# 计算元素的平均值
np.mean(a,axis=0)
# array([ 0.2299888 , -6.84851562, -0.81976091, -2.94201037, -4.99286266])

# 计算元素的标准差
np.std(a,axis=0)
# array([0.71805082, 1.32510965, 0.93356093, 5.57510549, 3.84965594])

# 计算元素的方差
np.var(a,axis=0)
# array([ 0.51559698,  1.75591558,  0.87153601, 31.08180126, 14.81985088])

# 计算元素的最小值
np.min(a,axis=0)
# array([-0.71376993, -8.59837008, -2.07373425, -7.85269394, -9.0139151 ])

# 找出最小值的索引
np.argmin(a,axis=0)
# array([1, 0, 2, 2, 1], dtype=int64)

布尔判断函数

只要有一个元素为0，则all方法就返回False，只要有一个元素不为0，则any方法就返回True

b = np.arange(0,10)
b.all() #False
b.any() #True

排序

np.sort(a)
# array([[-8.59837008, -6.16087844, -0.55049924,  1.02649882,  4.8552966 ],
#        [-9.0139151 , -5.82863377, -5.39274383, -0.71376993,  0.16495075],
#        [-7.85269394, -6.55443296, -2.07373425,  0.19620556,  0.37723752]])
np.sort(a,axis=0)
# array([[-0.71376993, -8.59837008, -2.07373425, -7.85269394, -9.0139151 ],
#        [ 0.37723752, -6.55443296, -0.55049924, -5.82863377, -6.16087844],
#        [ 1.02649882, -5.39274383,  0.16495075,  4.8552966 ,  0.19620556]])

np.argsort(a)
# array([[1, 4, 2, 0, 3],
#        [4, 3, 1, 0, 2],
#        [3, 1, 2, 4, 0]], dtype=int64)
np.argsort(a,axis=0)
# array([[1, 0, 2, 2, 1],
#        [2, 2, 0, 1, 0],
#        [0, 1, 1, 0, 2]], dtype=int64)

# 降序排序
-np.sort(-a)
# array([[ 4.8552966 ,  1.02649882, -0.55049924, -6.16087844, -8.59837008],
#        [ 0.16495075, -0.71376993, -5.39274383, -5.82863377, -9.0139151 ],
#        [ 0.37723752,  0.19620556, -2.07373425, -6.55443296, -7.85269394]])
indexes = np.argsort(-a)
np.take(a,indexes)
# array([[ 4.8552966 ,  1.02649882, -0.55049924, -6.16087844, -8.59837008],
#        [-0.55049924,  1.02649882, -8.59837008,  4.8552966 , -6.16087844],
#        [ 1.02649882, -6.16087844, -0.55049924, -8.59837008,  4.8552966 ]])

其他函数

c = np.random.randint(0,100,size=(3,20))
np.apply_along_axis(lambda x:x[np.logical_and(x!=x.max(),x!=x.min())].mean(),axis=1,arr=c)
# array([52.61111111, 47.5       , 52.61111111])

np.linspace(0,10,9)
# array([ 0.  ,  1.25,  2.5 ,  3.75,  5.  ,  6.25,  7.5 ,  8.75, 10.  ])

d = np.random.randint(0,10,size=(3,5))
np.unique(d)
# array([0, 2, 3, 4, 5, 6, 7, 8, 9])
np.unique(d,return_counts=True)
# (array([0, 2, 3, 4, 5, 6, 7, 8, 9]),
#  array([1, 1, 1, 1, 4, 1, 2, 2, 2], dtype=int64))

pandas库（一）

介绍

强大的分析结构化数据的工具集
基础是Numpy，提供了高性能矩阵的运算
应用于数据挖掘，数据分析
提供数据清洗功能

# 导入
import pandas as pd

Series

Series介绍

一维标记的数组型对象
由数据和索引组成

Series创建

通过list创建

# 通过list创建
s1 = pd.Series([1,2,3,4,5])
print(s1)
# 0    1
# 1    2
# 2    3
# 3    4
# 4    5
# dtype: int64
print(type(s1))
# <class 'pandas.core.series.Series'>

通过数组创建

# 通过数组创建
import numpy as np
arr1 = np.arange(1,6)
s2 = pd.Series(arr1)
print(s2)
# 0    1
# 1    2
# 2    3
# 3    4
# 4    5
# dtype: int32
# 指定索引
s3 = pd.Series(arr1,index=['a','b','c','d','e'])
print(s3)
# a    1
# b    2
# c    3
# d    4
# e    5
# dtype: int32

通过字典创建

# 通过字典创建
dict = {'name':'潘小雷','age':20}
s4 = pd.Series(dict,index=['name','age'])
print(s4)
# name    潘小雷
# age      20
# dtype: object

Series基本用法

isnull和notnull检查缺失值

s4.isnull() # 判断是否为空，如果为空则为True
# name    False
# age     False
# sex      True
# dtype: bool
s4.notnull() # 判断是否不为空，非空状态为True
# name     True
# age      True
# sex     False
# dtype: bool

通过索引获取数据

print(s4.index)
# Index(['name', 'age', 'sex'], dtype='object')
print(s4.values)
# ['潘小雷' 20 nan]
print(s4[0])
# 潘小雷
print(s4['name'])
# 潘小雷
print(s4[[0,2]])
# name    潘小雷
# sex     NaN
# dtype: object
print(s4[0:1])
# name    潘小雷
# dtype: object
print(s4['name':'age']) # 与索引切片不同的是，标签切片包含末端数据，在此例子中表现为包含name
# name    潘小雷
# age      20
# dtype: object
# 布尔索引
print(s2[s2>3])
# 3    4
# 4    5
# dtype: int32

索引与数据的对应关系不被运算结果所影响

print(s2*2)
# 0     2
# 1     4
# 2     6
# 3     8
# 4    10
# dtype: int32

name属性

s2.name = 'temp'
s2.index.name = 'year'
print(s2)
# year
# 0    1
# 1    2
# 2    3
# 3    4
# 4    5
# Name: temp, dtype: int32

head方法和tail方法截取其中数据

s2.head(3) # 不传参数默认5，显示前5行
# year
# 0    1
# 1    2
# 2    3
# Name: temp, dtype: int32
s2.tail(3) # 不传参数默认5，显示后5行
# year
# 2    3
# 3    4
# 4    5
# Name: temp, dtype: int32

DataFrame

DataFrame介绍

表格型的数据结构
含有一组有序的列，每列可以是不同类型的值
既有行索引，也有列索引
可以看作是由Series组成的字典（并且共用一个索引）

DataFrame构建

字典类
（1）数组、列表或元组组成的字典构造DataFrame

data = {'a':[1,2,3,4],
       'b':(5,6,7,8),
       'c':np.arange(9,13)}
frame = pd.DataFrame(data)
print(frame)
#    a  b   c
# 0  1  5   9
# 1  2  6  10
# 2  3  7  11
# 3  4  8  12
# index属性查看行索引
print(frame.index)
# RangeIndex(start=0, stop=4, step=1)
# columns属性查看列索引
print(frame.columns)
# Index(['a', 'b', 'c'], dtype='object')
# values查看值
print(frame.values)
# [[ 1  5  9]
#  [ 2  6 10]
#  [ 3  7 11]
#  [ 4  8 12]]
# 指定行索引和列索引
frame = pd.DataFrame(data,index=['A','B','C','D'],columns=['a','b','c','d'])
print(frame)
#    a  b   c    d
# A  1  5   9  NaN
# B  2  6  10  NaN
# C  3  7  11  NaN
# D  4  8  12  NaN

（2）Series构成的字典构造DataFrame

# Series构成的字典构造DataFrame
pd1 = pd.DataFrame({'a':pd.Series(np.arange(3)),
                  'b':pd.Series(np.arange(3,5))})
print(pd1)
#    a    b
# 0  0  3.0
# 1  1  4.0
# 2  2  NaN

（3）字典构成的字典构造DataFrame

# 字典构成的字典构造DataFrame
# 字典嵌套
data1 = {'a':{'name':'潘小雷','age':20},
        'b':{'name':'鲸鱼','age':20}}
pd2 = pd.DataFrame(data1)
print(pd2)
#         a   b
# name  潘小雷  鲸鱼
# age    20  20

列表类
（1）2D ndarray构造DataFrame

arr1 = np.arange(12).reshape(4,3)
frame1 = pd.DataFrame(arr1)
print(frame1)
#    0   1   2
# 0  0   1   2
# 1  3   4   5
# 2  6   7   8
# 3  9  10  11

（2）字典构成的列表构造DataFrame

li = [{'name':'潘小雷','age':20},
     {'name':'鲸鱼','age':20}]
frame2 = pd.DataFrame(li)
print(frame2)
#   name  age
# 0  潘小雷   20
# 1   鲸鱼   20

（3）Series构成的列表构造DataFrame

list1 = [pd.Series([1,2,3]),pd.Series(np.random.rand(3))]
frame3 = pd.DataFrame(list1)
print(frame3)
以上是关于Python数据分析的主要内容，如果未能解决你的问题，请参考以下文章