大数据分析及应用
Posted 坏坏-5
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了大数据分析及应用相关的知识,希望对你有一定的参考价值。
Python 数组与曲线绘制(一)
- 第1关 使用函数值填充列表
class Solution:
    def solve(self, s, e):
        """Evaluate the standard normal density at 41 evenly spaced points.

        The points run from ``s`` to ``e`` inclusive (40 equal intervals) and
        the density is ``1/sqrt(2*pi) * exp(-x**2 / 2)``.

        :type s, e: int, int
        :rtype: list
        """
        # Fill in the code here according to the "programming requirements"
        #********** Begin *********#
        import math

        dx = (e - s) / 40
        xlist = [s + i * dx for i in range(0, 41)]

        def f(x):
            return 1 / math.sqrt(2 * math.pi) * math.exp(-0.5 * x ** 2)

        ylist = [f(x) for x in xlist]
        return ylist
        ##********** End **********#
- 第2关 填充数组(循环版本)
class Solution:
    def solve(self, s, e):
        """Fill a NumPy array with standard normal density values, one by one.

        41 abscissas are spread evenly over ``[s, e]``; the array of the
        corresponding density values is returned.

        :type s, e: int, int
        :rtype: numpy.ndarray
        """
        # Fill in the code here according to the "programming requirements"
        #********** Begin *********#
        import numpy as np

        xlist = np.zeros(41)
        ylist = np.zeros(41)
        # Explicit element-wise loop: this level is the "loop version".
        for i in range(41):
            xlist[i] = s + i * (e - s) / 40
            ylist[i] = 1 / np.sqrt(2 * np.pi) * np.exp(-0.5 * xlist[i] ** 2)
        return ylist
        ##********** End **********#
- 第3关 填充数组(向量化版本)
class Solution:
    def solve(self, s, e):
        """Compute the standard normal density on a grid with vectorized NumPy.

        :type s, e: int, int
        :rtype xlist, ylist: numpy.array, numpy.array
        :return: the 41 grid points from ``s`` to ``e`` and the density values
            at those points.
        """
        # Fill in the code here according to the "programming requirements"
        #********** Begin *********#
        import numpy as np

        xlist = np.linspace(s, e, 41)
        ylist = 1 / np.sqrt(2 * np.pi) * np.exp(-0.5 * xlist ** 2)
        return xlist, ylist
        ##********** End **********#
- 第4关 绘制函数
class Solution:
    def solve(self, s, e):
        """Plot the standard normal density over ``[s, e]`` and save the image.

        The curve is sampled at 41 evenly spaced points and written to
        ``step4/stu_img/student.png``.

        :type s, e: int, int
        :rtype: None
        """
        # Add the code here according to the "programming requirements"
        #********** Begin *********#
        from matplotlib import pyplot as plt
        import math

        dx = (e - s) / 40
        xlist = [s + i * dx for i in range(0, 41)]

        def f(x):
            return 1 / math.sqrt(2 * math.pi) * math.exp(-0.5 * x ** 2)

        ylist = [f(x) for x in xlist]
        plt.plot(xlist, ylist)
        ##********** End **********#
        # Save first: with an interactive backend the figure is gone once
        # show() returns, and a later savefig() would write an empty image.
        plt.savefig("step4/stu_img/student.png")
        plt.show()
- 第5关 函数作用于向量
class Solution:
    def solve_1(self, v):
        """Apply ``f(x) = x**3 + x*exp(x) + 1`` to each element of a list.

        :type v: list
        :rtype: list
        """
        # Add the code here according to the "programming requirements"
        #********** Begin *********#
        import math

        def f(x):
            return x ** 3 + x * math.exp(x) + 1

        y = [f(a) for a in v]
        return y
        ##********** End **********#

    def solve_2(self, v):
        """Apply ``f(x) = x**3 + x*exp(x) + 1`` to a whole vector with NumPy.

        :type v: list
        :rtype: numpy.array
        """
        # Add the code here according to the "programming requirements"
        #********** Begin *********#
        import numpy as np

        xlist = np.array(v)
        ylist = xlist ** 3 + xlist * np.exp(xlist) + 1
        return ylist
        ##********** End **********#
- 第6关 手工模拟执行向量表达式
class Solution:
    def solve_1(self, x, t):
        """Evaluate ``cos(sin(x)) + exp(1/t)`` pairwise using the math module.

        Elements of ``x`` and ``t`` are paired by position.

        :type x, t: list, list
        :rtype: list
        """
        # Add the code here ("programming requirements": implement with math)
        #********** Begin *********#
        import math

        y = [math.cos(math.sin(xi)) + math.exp(1 / ti) for xi, ti in zip(x, t)]
        return y
        ##********** End **********#

    def solve_2(self, x, t):
        """Evaluate ``cos(sin(x)) + exp(1/t)`` element-wise using NumPy.

        :type x, t: list, list
        :rtype: numpy.array
        """
        # Add the code here ("programming requirements": implement with numpy)
        #********** Begin *********#
        import numpy as np

        # t is converted explicitly because a plain list cannot be a divisor.
        y_1 = np.cos(np.sin(x)) + np.exp(1 / np.array(t))
        return y_1
        ##********** End **********#
Python 数组与曲线绘制(二)
- 第1关 展示数组切片
[0. 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. 1.1 1.2 1.3 1.4 1.5 1.6 1.7
1.8 1.9 2. 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3. ]
[0. 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. 1.1 1.2 1.3 1.4 1.5 1.6 1.7
1.8 1.9 2. 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3. ]
[0. 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. 1.1 1.2 1.3 1.4 1.5 1.6 1.7
1.8 1.9 2. 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8]
[0. 0.5 1. 1.5 2. 2.5 3. ]
[0.2 0.8 1.4 2. 2.6]
- 第2关 绘制公式
class Solution:
    def solve(self, v0, g):
        """Plot ``y = v0*t - g*t**2/2`` for ``t`` in ``[0, 2*v0/g]``.

        The curve is sampled at 50 points, the axes are labelled
        ``time(s)`` / ``height(m)`` and the figure is saved to
        ``step2/stu_img/student.png``.

        :type v0, g: int, int
        :rtype: None
        """
        # Fill in the code here according to the "programming requirements"
        #********** Begin *********#
        import numpy as np
        from matplotlib import pyplot as plt

        # Abscissas
        x = np.linspace(0.0, 2 * v0 / g, 50)
        # Ordinates
        y = v0 * x - 1 / 2 * g * x * x
        # Draw the curve
        plt.plot(x, y)
        # Label the horizontal axis
        plt.xlabel('time(s)')
        # Label the vertical axis
        plt.ylabel('height(m)')
        ##********** End **********#
        # Save first: with an interactive backend the figure is gone once
        # show() returns, and a later savefig() would write an empty image.
        plt.savefig("step2/stu_img/student.png")
        plt.show()
- 第3关 绘制多参数公式
class Solution:
    def solve(self, v0):
        """Plot ``y = v*t - g*t**2/2`` for every ``v`` in ``v0`` on one figure.

        ``g`` is fixed at 9.81; each curve covers ``t`` in ``[0, 2*v/g]`` with
        50 samples. The figure is saved to ``step3/stu_img/student.png``.

        :type v0: List[int]
        :rtype: None
        """
        # Fill in the code here according to the "programming requirements"
        #********** Begin *********#
        import numpy as np
        from matplotlib import pyplot as plt

        g = 9.81
        for v in v0:
            t = np.linspace(0, 2.0 * v / g, 50)
            y = v * t - 0.5 * g * t ** 2
            plt.plot(t, y)
        plt.xlabel('time(s)')
        plt.ylabel('height(m)')
        ##********** End **********#
        # Save first: with an interactive backend the figure is gone once
        # show() returns, and a later savefig() would write an empty image.
        plt.savefig("step3/stu_img/student.png")
        plt.show()
- 第4关 指定图中轴的范围
class Solution:
    def solve(self, v0):
        """Plot one curve per initial velocity and fit the axes to the data.

        For each ``v`` in ``v0`` the curve ``y = v*t - g*t**2/2`` (``g`` =
        9.81) is drawn over ``t`` in ``[0, 2*v/g]``. The x axis ends at the
        largest ``t`` seen and the y axis at 1.1 times the largest ``y`` seen.
        The figure is saved to ``step4/stu_img/student.png``.

        :type v0: List[int]
        :rtype: None
        """
        # Fill in the code here according to the "programming requirements"
        #********** Begin *********#
        import numpy as np
        from matplotlib import pyplot as plt

        g = 9.81
        t1 = 0  # largest time value over all curves
        y1 = 0  # largest height value over all curves
        for v in v0:
            t = np.linspace(0, 2.0 * v / g, 50)
            if max(t) > t1:
                t1 = max(t)
            y = v * t - 0.5 * g * t ** 2
            if max(y) > y1:
                y1 = max(y)
            plt.plot(t, y)
        # 10% headroom above the highest point
        plt.axis([0, t1, 0, y1 * 1.1])
        plt.xlabel('time(s)')
        plt.ylabel('height(m)')
        ##********** End **********#
        # Save first: with an interactive backend the figure is gone once
        # show() returns, and a later savefig() would write an empty image.
        plt.savefig("step4/stu_img/student.png")
        plt.show()
- 第5关 绘制精确和不精确的华氏-摄氏转换公式
class Solution:
    def solve(self, s, e):
        """Plot two Fahrenheit-to-Celsius formulas over ``[s, e]``.

        ``(F - 30) / 2`` is drawn as red dots and ``(F - 32) * 5 / 9`` as a
        blue line, both on 50 sample points. The figure is saved to
        ``step5/stu_img/student.png``.

        :type s, e: int, int
        :rtype: None
        """
        # Fill in the code here according to the "programming requirements"
        #********** Begin *********#
        import numpy as np
        from matplotlib import pyplot as plt

        f = np.linspace(s, e, 50)
        c1 = (f - 30) / 2
        c2 = (f - 32) * 5 / 9
        plt.plot(f, c1, 'r.', f, c2, 'b-')
        ##********** End **********#
        # Save first: with an interactive backend the figure is gone once
        # show() returns, and a later savefig() would write an empty image.
        plt.savefig("step5/stu_img/student.png")
        plt.show()
- 第6关 绘制球的轨迹
class Solution:
    def solve(self, y0, theta, v0):
        """Plot ``y = x*tan(theta) - g*x**2 / (2*v0**2*cos(theta)**2) + y0``.

        ``theta`` is given in degrees and ``g`` is fixed at 9.81. The curve is
        drawn from ``x = 0`` to the larger root of ``y(x) = 0`` with 51
        samples and saved to ``step6/stu_img/student.png``.

        :type y0, theta, v0: int, int, int
        :rtype: None
        """
        # Fill in the code here according to the "programming requirements"
        #********** Begin *********#
        import numpy as np
        from matplotlib import pyplot as plt

        g = 9.81
        theta = theta / 180.0 * np.pi  # degrees -> radians
        # Coefficients of y = a*x**2 + b*x + c, used to find where y == 0.
        a = -1 / (2 * v0 ** 2) * g / (np.cos(theta) ** 2)
        b = np.tan(theta)
        c = y0
        delta = np.sqrt(b ** 2 - 4 * a * c)
        x0 = (-b - delta) / (2 * a)
        x1 = (-b + delta) / (2 * a)
        xmax = max(x0, x1)
        x = np.linspace(0, xmax, 51)
        y = x * np.tan(theta) - 1 / (2 * v0 ** 2) * g * (x ** 2) / (np.cos(theta) ** 2) + y0
        plt.plot(x, y)
        plt.axis([min(x), max(x), 0, max(y) * 1.1])
        ##********** End **********#
        # Save first: with an interactive backend the figure is gone once
        # show() returns, and a later savefig() would write an empty image.
        plt.savefig("step6/stu_img/student.png")
        plt.show()
- 第7关 绘制文件中的双列数据
class Solution:
    def solve(self, file):
        """Plot the two whitespace-separated numeric columns of a text file.

        The first column is used as x and the second as y. The mean, maximum
        and minimum of the y column are printed, and the figure is saved to
        ``step7/stu_img/student.png``.

        :type file: str
        :rtype: None
        """
        # Fill in the code here according to the "programming requirements"
        #********** Begin *********#
        from matplotlib import pyplot as plt

        x, y = [], []
        # The context manager closes the file even if a line fails to parse.
        with open(file, 'r') as ifile:
            for line in ifile:
                a = line.split()
                if not a:
                    # Blank line (e.g. a trailing newline): nothing to parse.
                    continue
                x.append(float(a[0]))
                y.append(float(a[1]))
        print(sum(y) / len(y), max(y), min(y))
        plt.plot(x, y)
        ##********** End **********#
        # Save first: with an interactive backend the figure is gone once
        # show() returns, and a later savefig() would write an empty image.
        plt.savefig("step7/stu_img/student.png")
        plt.show()
Python 数组与曲线绘制(三)
- 第1关 绘图函数 - 绘制 sin 函数
# Draw the sine curve from tabulated values (angles in degrees).
import matplotlib
matplotlib.use("Agg")  # Select the non-interactive backend; do not delete
import matplotlib.pyplot as plt
# Add the implementation here #
# ********** Begin *********#
x = [0,30,60,90,120,150,180,210,240,270,300,330,360]
y = [0,0.5,0.866,1,0.866,0.5,0,-0.5,-0.866,-1,-0.866,-0.5,0]
plt.plot(x, y, '.')
# No plt.show(): the Agg backend cannot open a window, so the figure is only
# written to disk below.
# ********** End **********#
plt.savefig('picture/step0/fig0.png')  # Store the output image; do not delete
- 第2关 绘图与保存 - 抛物线函数曲线
# 请绘制抛物线曲线
import matplotlib
matplotlib.use("Agg")
def f(x):
    """Return the parabola ``3*x**2 + 2*x + 1`` evaluated at ``x``.

    ``x`` may be any operand supporting ``*``, ``**`` and ``+`` with numbers
    (for example an int or a float).
    """
    # Add the implementation here #
    # ********** Begin1 *********#
    return 3 * (x ** 2) + 2 * x + 1
    # ********** End1 **********#
# Add code here to draw the curve and store the image #
# ********** Begin2 *********#
import matplotlib.pyplot as plt
# Sample 3*x**2 + 2*x + 1 at the integers 0..50.
x = list(range(0, 51, 1))
y = [3 * (xi ** 2) + 2 * xi + 1 for xi in x]
plt.plot(x, y, 'r--')
# No plt.show(): the Agg backend selected above cannot open a window, so the
# figure is only written to disk.
plt.savefig('picture/step1/fig1.png')
# ********** End2 **********#
- 第3关 数组计算与向量化处理 - 函数曲线绘制与坐标处理
# Draw the curve y = t**2 * exp(-t**2) for t in [0, 3].
import matplotlib
matplotlib.use("Agg")
# Add the implementation here #
# ********** Begin *********#
import numpy as np
import matplotlib.pyplot as plt
t = np.linspace(0, 3, 50)
y = t**2 * np.exp(-t**2)
plt.plot(t, y)
# No plt.show(): the Agg backend cannot open a window, so the figure is only
# written to disk.
plt.savefig('picture/step2/fig2.png')
# ********** End **********#
- 第4关 图例与坐标设置 - 绘制多条曲线
# Draw two curves in the same coordinate system.
import matplotlib
matplotlib.use("Agg")
# Add the implementation here #
# ********** Begin *********#
import numpy as np
import matplotlib.pyplot as plt
t = np.linspace(0, 3, 50)
# Both curves share the same Gaussian factor exp(-t**2).
gauss = np.exp(-t**2)
y1 = t**2 * gauss
y2 = t**4 * gauss
# Red dashed line for y1, blue line with circle markers for y2.
for curve, fmt in ((y1, 'r--'), (y2, 'b-o')):
    plt.plot(t, curve, fmt)
plt.title('Plotting two curves in the same plot')
plt.xlabel('t')
plt.ylabel('y')
plt.legend(['y1', 'y2'])
plt.savefig('picture/step3/fig3.png')
# ********** End **********#
- 第5关 向量化处理 - 绘制函数图形
# 请编写代码实现向量化帽函数并绘制函数曲线
import matplotlib
matplotlib.use("Agg")
# 请在此添加实现代码 #
# ********** Begin *********#
import numpy as np
import matplotlib.pyplot as plt
def H3(x):
    """Vectorized hat function.

    Element-wise: 0 where ``x < 0``, ``x`` where ``0 <= x < 1``, ``2 - x``
    where ``1 <= x < 2`` and 0 otherwise. ``x`` is expected to be a NumPy
    array (or a scalar); the result comes from ``np.where``.
    """
    return np.where(
        x < 0,
        0,
        np.where(x < 1, x, np.where(x < 2, 2 - x, 0)),
    )
# Sample the hat function on [-3, 5] and draw it as a blue line.
x = np.linspace(-3, 5, 1000)
y = H3(x)
plt.title('Plotting hat func in this plot')
plt.plot(x, y, 'b-')
# No plt.show(): the Agg backend selected above cannot open a window, so the
# figure is only written to disk.
plt.savefig('picture/step4/fig4.png')
# ********** End **********#
Python 绘图进阶
- 第1关 柱状图 - 商品房销售价格统计图
# Draw a bar chart of the average selling price of residential commercial housing.
import matplotlib
matplotlib.use("Agg")
# Add the implementation here #
# ********** Begin *********#
import matplotlib.pyplot as plt
from numpy import *
# Adjacent string literals are joined by the parser, giving one
# space-separated string without backslash line continuations.
xstring = ('2015 2014 2013 2012 2011 '
           '2010 2009 2008 2007 2006 '
           '2005 2004 2003 2002 2001 2000')
ystring = ('12914 11826 12997 12306.41 12327.28 '
           '11406 10608 8378 8667.02 8052.78 '
           '6922.52 5744 4196 4336 4588 4751')
# The data is listed newest-first; reverse it so the years increase left to right.
y = ystring.split()
y.reverse()
y = [float(price) for price in y]
xlabels = xstring.split()
xlabels.reverse()
x = range(len(xlabels))
plt.xticks(x, xlabels, rotation = 45)
plt.yticks(range(4000,13500,1000))
plt.ylim(4000,13500)
plt.bar(x, y, color = '#800080')
plt.savefig('picture/step1/fig1.png')
# ********** End **********#
- 第2关 并列柱状图 - 商品房销售价格统计图
# -*- coding: utf-8 -*-
import matplotlib
import re
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np
# x-axis labels. Adjacent string literals are joined by the parser, giving one
# space-separated string without backslash line continuations.
xstring = ('2015 2014 2013 2012 2011 '
           '2010 2009 2008 2007 2006 '
           '2005 2004 2003 2002 2001 2000')
n = 6
ystring = ['']*n  # the 6 data series for the y axis
ystring[0] = '6793 6324 6237 5790.99 5357.1 5032 4681 3800 3863.9 3366.79 3167.66 2778 2359 2250 2170 2112'
ystring[1] = '6473 5933 5850 5429.93 4993.17 4725 4459 3576 3645.18 3119.25 2936.96 2608 2197 2092 2017 1948'
ystring[2] = '15157 12965 12591 11460.19 10993.92 10934 9662 7801 7471.25 6584.93 5833.95 5576 4145 4154 4348 4288'
ystring[3] = '12914 11826 12997 12306.41 12327.28 11406 10608 8378 8667.02 8052.78 6922.52 5744 4196 4336 4588 4751'
ystring[4] = '9566 9817 9777 9020.91 8488.21 7747 6871 5886 5773.83 5246.62 5021.75 3884 3675.14 3488.57 3273.53 3260.38'
ystring[5] = '4845 5177 4907 4305.73 4182.11 4099 3671 3219 3351.44 3131.31 2829.35 2235 2240.74 1918.83 2033.08 1864.37'
labels = ['Commercial housing', 'Residential commercial housing',
          'high-end apartments', 'Office Building', 'Business housing', 'Others']  # legend labels
colors = ['#ff7f50', '#87cefa', '#DA70D6', '#32CD32', '#6495ED', '#FF69B4']  # bar colors
# Add the implementation here #
# ********** Begin *********#
# In a raw string a single backslash is the regex escape: \b is a word
# boundary, \d a digit and \. a literal dot. The data is listed newest-first,
# so every sequence is reversed to run from 2000 to 2015.
x_labels = re.findall(r'\b\d+\b', xstring)[::-1]
ylist = []
for y in ystring:
    # y.split() would work as well
    ylist.append(list(map(float, re.findall(r'[0-9]+\.?[0-9]*', y)))[::-1])
bar_width = 0.8
# One group every 6 units on the x axis, leaving room for the n bars.
xindex = np.arange(1, 92, 6)
fig, ax = plt.subplots()
for i in range(n):
    ax.bar(xindex + bar_width*i, ylist[i], bar_width, color=colors[i])
ax.set_xlim(-1, 98)
# Put each tick label under the middle of its group of bars.
plt.xticks(xindex + bar_width*2.5, x_labels, rotation=45)
ax.set_ylim(1450, 15300)
plt.yticks(np.arange(2000, 16000, 2000))
plt.legend(labels, loc='upper left')
plt.title('Selling Prices of Six Types of Housing')
plt.savefig('picture/step2/fig2.png')
# ********** End **********#
- 第3关 饼状图 - 2010 全国人口普查数据分析
# Draw a pie chart of the education level distribution of women of childbearing age.
import matplotlib
matplotlib.use("Agg")
# Add the implementation here #
# ********** Begin *********#
import matplotlib.pyplot as plt
# One row per wedge: (label, wedge color, number of women, explode offset).
# Only the 'junior' wedge is pulled out of the pie.
wedges = [
    ('none', 'red', 2052380, 0),
    ('primary', 'orange', 11315444, 0),
    ('junior', 'yellow', 20435242, 0.1),
    ('senior', 'green', 7456627, 0),
    ('specialties', 'purple', 3014264, 0),
    ('bachelor', 'blue', 1972395, 0),
    ('master', 'black', 185028, 0),
]
labels = [row[0] for row in wedges]
colors = [row[1] for row in wedges]
womenCount = [row[2] for row in wedges]
explode = [row[3] for row in wedges]
plt.pie(womenCount, explode=explode, labels=labels, shadow=True, colors=colors)
plt.axis('equal')  # equal aspect ratio, so the pie is drawn as a circle
plt.savefig('picture/step3/fig3.png')
# ********** End **********#
- 第4关 多子图绘制 - 2010 全国人口普查数据分析
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np
labels = ['none', 'primary', 'junior', 'senior', 'specialties', 'bachelor', 'master'] # tick labels
womenCount = [2052380, 11315444, 20435242, 7456627, 3014264, 1972395, 185028]
birthMen = [2795259, 12698141, 13982478, 2887164, 903910, 432333, 35915]
birthWomen = [2417485, 11000637, 11897674, 2493829, 786862, 385718, 32270]
liveMen = [2717613, 12477914, 13847346, 2863706, 897607, 429809, 35704]
liveWomen = [2362007, 10854232, 11815939, 2480362, 783225, 384158, 32136]
# Add the implementation here #
# ********** Begin *********#
x = np.arange(len(labels))
# Element-wise totals of the male and female counts per education level.
birth = np.add(birthMen, birthWomen)
live = np.add(liveMen, liveWomen)
plt.figure(figsize=[14,5])  # canvas size

# Left panel: live total divided by the number of women, drawn in red.
# NOTE(review): the variable is called birthrate but is computed from `live`,
# not `birth` - confirm this is the intended ratio.
plt.subplot(121)
birthrate = live.astype(float) / np.asarray(womenCount, dtype=float)
plt.plot(x, birthrate, 'r')
plt.xticks(x, labels)

# Right panel: live total as a percentage of the birth total, drawn in blue.
plt.subplot(122)
liverate = live.astype(float) / birth.astype(float) * 100
plt.plot(x, liverate, 'b')
plt.xticks(x, labels)

plt.savefig('picture/step4/fig4.png')
# ********** End **********#
Python数据可视化之折线图
- 第1关 折线图的绘制与优化
# -*- coding: utf-8 -*-
import pandas as pd  # builds the data structure used for plotting
import numpy as np  # generates the x-axis tick positions
from matplotlib import pyplot as plt  # draws the line chart
population = pd.read_csv(r"LineChart/level1/csv/world-population.csv")  # returns a DataFrame


def plot():
    """Draw the "Population" column against "Year" as a line chart and save it.

    The x axis is ticked every 5 years from 1960 to 2010 with a dashed red
    vertical grid; the image is written to
    ``LineChart/level1/studentanswer/world-population.png``.
    """
    # ********* Begin *********#
    fig, ax = plt.subplots()
    my_x_ticks = np.arange(1960, 2011, 5)
    plt.xticks(my_x_ticks)
    # The on/off flag is passed positionally: its keyword was renamed from
    # `b` to `visible` in matplotlib 3.5, so neither spelling is portable.
    plt.grid(True, color='r', linestyle='--', linewidth=1, alpha=0.3, axis='x', which="major")
    # Points and connecting line
    ax.plot(population["Year"], population["Population"], linewidth=1, c='#00CC88', marker='*', markersize=4)
    ax.set_xlabel("Year", fontsize=12)  # x-axis label
    ax.set_ylabel("Population", fontsize=12)
    # ********* End *********#
    plt.savefig('LineChart/level1/studentanswer/world-population.png')  # save as PNG
    plt.close()  # close the figure
Python数据可视化之柱形图
- 第1关 “大胃王”比赛数据柱形图绘制——绘制柱形图的基本步骤
# -*- coding: utf-8 -*-
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
hot_dog = pd.read_csv(r"matplotlib_bar/csv/hot-dog-contest-winners.csv")


def plot():
    """Draw a bar chart of the "Dogs eaten" column per "Year" and save it.

    The image is written to
    ``matplotlib_bar/studentfile/studentanswer/level_1/US.png``.
    """
    # ********* Begin *********#
    fig, ax = plt.subplots()  # subplots returns the figure and its axes
    # First argument is the x variable, second the bar heights.
    ax.bar(hot_dog["Year"], hot_dog["Dogs eaten"])
    # ********* End *********#
    # Save first: with an interactive backend the figure is gone once show()
    # returns, and a later savefig() would write an empty image.
    plt.savefig('matplotlib_bar/studentfile/studentanswer/level_1/US.png')
    plt.show()
    plt.close()
- 第2关 “大胃王”比赛数据柱形图绘制——柱形图展示优化
# -*- coding: utf-8 -*-
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
hot_dog = pd.read_csv(r"matplotlib_bar/csv/hot-dog-contest-winners.csv")


def plot():
    """Draw the styled hot dog contest bar chart and save it.

    Bars show "Dogs eaten" per "Year", colored by ``unitedStatesColor()``,
    with axis labels, a title and an x range of 1979-2011. The image is
    written to ``matplotlib_bar/studentfile/studentanswer/level_2/US.png``.
    """
    # ********* Begin *********#
    # The default figure size only affects figures created afterwards, so it
    # has to be set before subplots() for this chart to pick it up.
    plt.rcParams['figure.figsize'] = (8.0, 4.0)
    fig, ax = plt.subplots()
    ax.bar(hot_dog["Year"], hot_dog["Dogs eaten"], width=0.6, color=unitedStatesColor())
    ax.set_xlabel("Year")  # x-axis label
    ax.set_ylabel("Dogs Eaten")  # y-axis label
    ax.set_title("Hotdog game scores 1980-2010")  # title
    ax.set_xlim(1979, 2011)
    # ********* End *********#
    # Save first: with an interactive backend the figure is gone once show()
    # returns, and a later savefig() would write an empty image.
    plt.savefig('matplotlib_bar/studentfile/studentanswer/level_2/US.png')
    plt.show()
    plt.close()


def unitedStatesColor():
    """Return one bar color per row of ``hot_dog``.

    Rows whose "Country" is "United States" get pink (``#DB7093``); all other
    rows get grey-green (``#5F9F9F``).
    """
    # ********* Begin *********#
    bar_colors = []
    for country in hot_dog["Country"]:
        if country == "United States":
            bar_colors.append("#DB7093")  # pink
        else:
            bar_colors.append("#5F9F9F")  # grey-green
    return bar_colors
    # ********* End *********#
Python数据可视化之散点图
- 第1关 美国犯罪率数据散点图绘制——散点图的基本绘制步骤
# -*- coding: utf-8 -*-
import pandas as pd  # builds the data structure used for plotting
from matplotlib import pyplot as plt  # draws the scatter plot
import statsmodels.api as sm  # locally weighted regression
from matplotlib.backends.backend_pdf import PdfPages
crime = pd.read_csv(r"matplotlibScatter/csv/crimeRatesByState2005.csv")  # returns a DataFrame


def plot():
    """Scatter the "burglary" column against "murder" and save the figure.

    Rows whose "state" is 'District of Columbia' or 'United States' are
    excluded. The image is written to
    ``matplotlibScatter/studentanswer/level_1/crime.png``.
    """
    # ********* Begin *********#
    fig, ax = plt.subplots()  # subplots returns the figure and its axes
    # Drop the 'United States' and 'District of Columbia' rows.
    crime2 = crime[~crime['state'].isin(['District of Columbia', 'United States'])]
    ax.plot(crime2["murder"], crime2["burglary"], "*", color="#00CC88")
    ax.set_xlabel("crime murder", fontsize=12)  # x-axis label
    ax.set_ylabel("crime burglary", fontsize=12)
    ax.set_xlim(0, 10)  # x axis from 0 to 10
    ax.set_ylim(0, 1200)
    # ********* End *********#
    # Save first: with an interactive backend the figure is gone once show()
    # returns, and a later savefig() would write an empty image.
    plt.savefig('matplotlibScatter/studentanswer/level_1/crime.png')  # save as PNG
    plt.show()
    plt.close()  # close the figure
- 第2关 美国犯罪率数据散点图绘制——局部加权回归
# -*- coding: utf-8 -*-
import pandas as pd  # builds the data structure used for plotting
from matplotlib import pyplot as plt  # draws the scatter plot
import statsmodels.api as sm  # locally weighted regression
crime = pd.read_csv(r"matplotlibScatter/csv/crimeRatesByState2005.csv")  # returns a DataFrame


def plot():
    """Scatter "burglary" against "murder" with a LOWESS curve and save it.

    Rows whose "state" is 'District of Columbia' or 'United States' are
    excluded. The image is written to
    ``matplotlibScatter/studentanswer/level_2/crime.png``.
    """
    # ********* Begin *********#
    # Create the sized figure and its axes in one call; a separate
    # plt.figure() would leave an extra, never-closed figure behind and the
    # size would not apply to the axes drawn on.
    fig, ax = plt.subplots(figsize=(8, 4))
    crime2 = crime[~crime['state'].isin(['District of Columbia', 'United States'])]
    # lowess(endog, exog): burglary is smoothed as a function of murder.
    # Column 0 of the result is plotted as x and column 1 as y.
    lowess = sm.nonparametric.lowess(crime2["burglary"], crime2["murder"])
    ax.plot(lowess[:, 0], lowess[:, 1])
    ax.plot(crime2["murder"], crime2["burglary"], "*", color="#00CC88")
    ax.set_xlabel("crime murder", fontsize=12)
    ax.set_ylabel("crime burglary", fontsize=12)
    ax.set_title("美国谋杀率和入室盗窃率", fontproperties="SimHei", fontsize=16)
    ax.set_xlim(0, 10)
    ax.set_ylim(0, 1200)
    # ********* End *********#
    # Save first: with an interactive backend the figure is gone once show()
    # returns, and a later savefig() would write an empty image.
    plt.savefig('matplotlibScatter/studentanswer/level_2/crime.png')  # save as PNG
    plt.show()
    plt.close()  # close the figure
Python数据可视化之多维量法(MDS)
- 第1关 美国国家教育统计中心数据——降维
# -*- coding: utf-8 -*-
import pandas as pd  # builds the data structure used for plotting
from sklearn.manifold import MDS  # MDS dimensionality reduction
import matplotlib.pyplot as plt  # draws the scatter plot
from sklearn.cluster import KMeans  # KMeans clustering
from scipy.spatial import distance  # distance matrix computation
edu = pd.read_csv(r"MDS/csv/education.csv")  # read the CSV data into a DataFrame


def plot():
    """Project the education data to 2-D with MDS and save a labelled scatter.

    Columns 1-6 of ``edu`` are the features and column 0 supplies the point
    labels. The image is written to ``MDS/studentanswer/level_1/education.png``.
    """
    # ********* Begin *********#
    edu_x = edu.iloc[:, 1:7]  # columns 1 through 6 of edu
    # Square matrix of pairwise Euclidean distances
    DM_dist = distance.squareform(distance.pdist(edu_x, metric="euclidean"))
    clf2 = MDS(n_components=2, dissimilarity="precomputed")
    edu_t2 = clf2.fit_transform(DM_dist)
    fig, ax = plt.subplots()
    ax.scatter(edu_t2[:, 0], edu_t2[:, 1])
    names = list(edu.iloc[:, 0])
    # Label each point with the value from column 0, slightly offset.
    for i, name in enumerate(names):
        plt.annotate(name, xy=(edu_t2[i, 0], edu_t2[i, 1]), xytext=(-20, 5), textcoords='offset points')
    # ********* End *********#
    plt.savefig("MDS/studentanswer/level_1/education.png")
    plt.close()
- 第2关 美国国家教育统计中心数据——分别按特征和聚类结果着色
# -*- coding: utf-8 -*-
import pandas as pd  # builds the data structure used for plotting
from sklearn.manifold import MDS  # MDS dimensionality reduction
import matplotlib.pyplot as plt  # draws the scatter plot
from sklearn.cluster import KMeans  # KMeans clustering
from scipy.spatial import distance  # distance matrix computation
edu = pd.read_csv(r"MDS/csv/education.csv")  # read the CSV data into a DataFrame


def plot():
    """Draw the 2-D MDS projection colored by the "reading" column and save it.

    Points and their labels are pink (``#DB7093``) when "reading" is below
    the column mean and grey-green (``#5F9F9F``) otherwise. The image is
    written to ``MDS/studentanswer/level_2/education.png``.
    """
    # ********* Begin *********#
    edu_x = edu.iloc[:, 1:7]  # columns 1 through 6 of edu
    # Square matrix of pairwise Euclidean distances
    DM_dist = distance.squareform(distance.pdist(edu_x, metric="euclidean"))
    clf2 = MDS(n_components=2, dissimilarity="precomputed")
    edu_t2 = clf2.fit_transform(DM_dist)
    fig, ax = plt.subplots()
    reading = edu_x["reading"]
    average = sum(reading) / len(reading)  # mean reading value
    # Iterate over the values themselves so the colors follow row order
    # whatever the DataFrame index labels are.
    reading_colors_list = [
        "#DB7093" if score < average else "#5F9F9F"  # below mean: pink, else grey-green
        for score in reading
    ]
    ax.scatter(edu_t2[:, 0], edu_t2[:, 1], color=reading_colors_list)
    names = list(edu.iloc[:, 0])  # column 0 holds the point labels
    for i, name in enumerate(names):
        plt.annotate(name, xy=(edu_t2[i, 0], edu_t2[i, 1]), xytext=(-20, 5),
                     textcoords='offset points', color=reading_colors_list[i])
    # ********* End *********#
    # Save first: with an interactive backend the figure is gone once show()
    # returns, and a later savefig() would write an empty image.
    plt.savefig("MDS/studentanswer/level_2/education.png")
    plt.show()
    plt.close()
以上是关于大数据分析及应用的主要内容,如果未能解决你的问题,请参考以下文章
AWS 线下研讨会 | 关于数据备份及大数据分析应用方案的那些事儿