python数据挖掘作业____商品间/商品内分析
Posted 孤影化双皮奶
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python数据挖掘作业____商品间/商品内分析相关的知识,希望对你有一定的参考价值。
# -*- coding: utf-8 -*-
# Listing 8-1: inspect basic characteristics of the order data.
import numpy as np
import pandas as pd

inputfile = 'D://CourseAssignment//AI//GoodSellMod//GoodsOrder.csv'  # input data file
data = pd.read_csv(inputfile, encoding='gbk')  # read the order records
data.info()  # print column dtypes and non-null counts

# Summarise the 'id' column only: number of rows, smallest id, largest id.
data = data['id']
description = [data.count(), data.min(), data.max()]
# Transpose so the three statistics become columns of a one-row frame.
description = pd.DataFrame(description, index=['Count', 'Min', 'Max']).T
print('描述性统计结果:\n', np.round(description))
# Sales volume and sales share of the ten best-selling goods.
import matplotlib.pyplot as plt
import pandas as pd

inputfile = 'D://CourseAssignment//AI//GoodSellMod//GoodsOrder.csv'  # input data file
data = pd.read_csv(inputfile, encoding='gbk')  # read the order records

# Count the rows per product; after count() the 'id' column holds the
# number of order lines in which each product appears.
group = data.groupby(['Goods']).count().reset_index()
# NOTE: deliberately not named `sorted` -- that would shadow the builtin
# for every piece of code executed afterwards in the same session.
sales_ranking = group.sort_values('id', ascending=False)
top10 = sales_ranking[:10]
print('销量排行前10商品的销量:\n', top10)

# Horizontal bar chart of the ten best sellers.
plt.rcParams['font.sans-serif'] = 'SimHei'  # font able to render the Chinese labels
x = top10['Goods']
y = top10['id']
plt.figure(figsize=(8, 4))  # canvas size
plt.barh(x, y)
plt.xlabel('销量')  # x-axis title
plt.ylabel('商品类别')  # y-axis title
plt.title('商品的销量TOP10--3009')
plt.savefig('D://CourseAssignment//AI//GoodSellMod//tmp//top10.png')  # save as .png
plt.show()

# Share of each top-10 product relative to all order lines.
data_nums = data.shape[0]
for _, row in top10.iterrows():
    print(row['Goods'], row['id'], row['id'] / data_nums)
# Listing 8-3: sales volume and sales share of each product category.
import pandas as pd

inputfile1 = 'D://CourseAssignment//AI//GoodSellMod//GoodsOrder.csv'
inputfile2 = 'D://CourseAssignment//AI//GoodSellMod//GoodsTypes.csv'
data = pd.read_csv(inputfile1, encoding='gbk')
types = pd.read_csv(inputfile2, encoding='gbk')

# Per-product row counts ('id' holds the count), largest first.
group = data.groupby(['Goods']).count().reset_index()
sort = group.sort_values('id', ascending=False).reset_index(drop=True)
data_nums = data.shape[0]  # total number of order lines

# Attach the category of every product. merge() joins on the columns the
# two frames have in common -- presumably just 'Goods'; verify against
# the header of GoodsTypes.csv.
sort_links = pd.merge(sort, types)

# Total per category, largest first.
sort_link = sort_links.groupby(['Types']).sum().reset_index()
sort_link = sort_link.sort_values('id', ascending=False).reset_index(drop=True)

# Share of each category among all order lines.
sort_link['percent'] = sort_link['id'] / data_nums
print('各类别商品的销量及其占比:\n', sort_link)

outfile1 = 'D://CourseAssignment//AI//GoodSellMod//tmp//percent.csv'
sort_link.to_csv(outfile1, index=False, header=True, encoding='gbk')  # save the result
# Pie chart of the sales share of every product category.
import matplotlib.pyplot as plt

# Configure the font before any text is created so the Chinese category
# labels are rendered with it.
plt.rcParams['font.sans-serif'] = 'SimHei'

data = sort_link['percent']
labels = sort_link['Types']
plt.figure(figsize=(8, 6))  # canvas size
plt.pie(data, labels=labels, autopct='%1.2f%%')
plt.title('每类商品销量占比--3009')
plt.savefig('D://CourseAssignment//AI//GoodSellMod//tmp//persent.png')  # save as .png
plt.show()
# Listing 8-4: sales volume and share of the goods inside the
# "非酒精饮料" (non-alcoholic beverages) category.
# .copy() makes `selected` an independent frame, so adding a column below
# does not trigger SettingWithCopyWarning or touch `sort_links`.
selected = sort_links.loc[sort_links['Types'] == '非酒精饮料'].copy()
child_nums = selected['id'].sum()  # total sales of the whole category
selected['child_percent'] = selected['id'] / child_nums  # share within the category
selected.rename(columns={'id': 'count'}, inplace=True)
print('非酒精饮料内部商品的销量及其占比:\n', selected)

outfile2 = 'D://CourseAssignment//AI//GoodSellMod//tmp//child_percent.csv'
# Write the per-product table computed above (the category-level table
# `sort_link` was being written here by mistake).
selected.to_csv(outfile2, index=False, header=True, encoding='gbk')
# Pie chart of the sales share of each product inside the
# non-alcoholic beverages category.
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = 'SimHei'  # font able to render the Chinese labels

data = selected['child_percent']
labels = selected['Goods']
plt.figure(figsize=(8, 6))  # canvas size

# Radial offset of every wedge. plt.pie() requires exactly one offset per
# wedge, so the hand-tuned values are padded with 0.1 / truncated to the
# actual number of products instead of assuming there are always eleven.
base_explode = (0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.08, 0.3, 0.1, 0.3)
explode = (base_explode + (0.1,) * len(data))[:len(data)]

plt.pie(data, explode=explode, labels=labels, autopct='%1.2f%%',
        pctdistance=1.1, labeldistance=1.2)
plt.title("非酒精饮料内部各商品的销量占比--3009")
plt.axis('equal')  # keep the pie circular
plt.savefig('D://CourseAssignment//AI//GoodSellMod//tmp//child_persent.png')  # save the figure
plt.show()
# Sales volume and share of the goods inside the "西点" (pastry) category:
# filter the category, compute each product's share, write the result.
# .copy() makes `selected` an independent frame, so adding a column below
# does not trigger SettingWithCopyWarning or touch `sort_links`.
selected = sort_links.loc[sort_links['Types'] == '西点'].copy()
child_nums = selected['id'].sum()  # total sales of the whole category
selected['child_percent'] = selected['id'] / child_nums  # share within the category
selected.rename(columns={'id': 'count'}, inplace=True)
print('西点内部商品的销量及其占比:\n', selected)

outfile3 = 'D://CourseAssignment//AI//GoodSellMod//tmp//bread_precent.csv'
# Write the per-product table computed above (the category-level table
# `sort_link` was being written here by mistake).
selected.to_csv(outfile3, index=False, header=True, encoding='gbk')
# Pie chart of the sales share of each product inside the pastry category.
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = 'SimHei'  # font able to render the Chinese labels

data = selected['child_percent']
labels = selected['Goods']
plt.figure(figsize=(8, 6))  # canvas size
# No wedge is pulled out of this chart (explode=None), so no offset tuple
# is defined here.
plt.pie(data, explode=None, labels=labels, autopct='%1.2f%%',
        pctdistance=1.1, labeldistance=1.2)
plt.title("西点内部各商品的销量占比--3009")
plt.axis('equal')  # keep the pie circular
plt.savefig('D://CourseAssignment//AI//GoodSellMod//tmp//bread_precent.png')  # save the figure
plt.show()
# -*- coding: utf-8 -*-
# Convert the order table into per-order transactions and build an FP-tree
# from the first few of them.
import pandas as pd

inputfile = 'D://CourseAssignment//AI//GoodSellMod//GoodsOrder.csv'

# Toy transactions that can be fed to createInitSet() to try the tree
# code without the CSV file.
SIMPLE_DATA = [
    ['beer', 'milk', 'chicken'],
    ['milk', 'bread'],
    ['milk', 'diaper'],
    ['beer', 'milk', 'bread'],
    ['beer', 'diaper'],
    ['milk', 'diaper'],
    ['beer', 'diaper'],
    ['beer', 'milk', 'diaper', 'chicken'],
    ['beer', 'milk', 'diaper'],
]

# Transactions parsed from `inputfile`; filled in when run as a script.
data_translation = []


def loadTransactions(path, encoding='gbk'):
    """Read the order CSV and return one list of goods per order id.

    Rows sharing the same 'id' are collected, in file order, into a single
    transaction; transactions are ordered by id.
    """
    orders = pd.read_csv(path, encoding=encoding)
    return orders.groupby('id')['Goods'].apply(list).tolist()


def loadSimpleData():
    """Return the first nine parsed transactions (fewer if fewer exist)."""
    for _ in range(3):
        print("********")
    # Slicing instead of indexing 0..8 avoids an IndexError on short data.
    return data_translation[:9]


def createInitSet(dataSet: list) -> dict:
    """Map each distinct transaction (as a frozenset) to its occurrence count."""
    returnSet = {}
    for item in dataSet:
        frozenItem = frozenset(item)
        returnSet[frozenItem] = returnSet.get(frozenItem, 0) + 1
    return returnSet


class TreeNode(object):
    """One node of the FP-tree."""

    def __init__(self, nameValue: str, numOccur: int, parentNode):
        # name of the item stored in this node
        self.name = nameValue
        # how often the path ending in this node occurs
        self.count = numOccur
        # next node in the tree that holds the same item
        self.nodeLink = None
        # parent node (None for the root)
        self.parentNode = parentNode
        # child nodes keyed by item name, e.g. 'milk': TreeNode('milk')
        self.children = {}

    def inc(self, count):
        """Increase the occurrence count by `count`."""
        self.count += count

    def show(self, ind=1):
        """Print this subtree, indenting each level by one more space."""
        print(' ' * ind, self.name, ' ', self.count)
        for child in self.children.values():
            child.show(ind + 1)


def getHeaderTable(dataSet, minSupport=1) -> dict:
    """Return {item: support} for every item whose support >= minSupport.

    `dataSet` maps frozenset transactions to their occurrence counts, as
    produced by createInitSet().
    """
    headerTable = {}
    for key, value in dataSet.items():
        for item in key:
            headerTable[item] = headerTable.get(item, 0) + value
    return {item: support for item, support in headerTable.items()
            if support >= minSupport}


def makeHeaderTable(headerTable: dict) -> dict:
    """Return a new table {item: [support, first node link (None)]}.

    A new dict is built so the caller's support table is left untouched and
    can safely be reused.
    """
    return {item: [support, None] for item, support in headerTable.items()}


def updateHeaderTable(toastNode: TreeNode, targetNode: TreeNode):
    """Append `targetNode` to the end of the node-link chain starting at `toastNode`."""
    while toastNode.nodeLink is not None:
        toastNode = toastNode.nodeLink
    toastNode.nodeLink = targetNode


class FPTree:
    """FP-tree built from counted frozenset transactions."""

    def __init__(self, frozenDataDict: dict, headerTable: dict, minSupport: int):
        self.treeNode = TreeNode('null', 1, None)
        # 'milk': [counter, nodeLink]
        self.headerTable = makeHeaderTable(headerTable)
        self.frozenDataDict = frozenDataDict
        self.minSupport = minSupport

    def updateTree(self, treeNode, items: list, count: int):
        """Insert the ordered item path `items` below `treeNode`, adding `count`.

        Existing nodes along the path are incremented; missing ones are
        created and hooked into the header table's node-link chain.
        """
        node = treeNode
        for item in items:
            child = node.children.get(item)
            if child is not None:
                child.inc(count)
            else:
                child = TreeNode(item, count, node)
                node.children[item] = child
                entry = self.headerTable[item]
                if entry[1] is None:
                    entry[1] = child
                else:
                    updateHeaderTable(entry[1], child)
            node = child

    def createFPTree(self):
        """Insert every transaction into the tree.

        Sets `headerTable` to None and returns early when there is no
        frequent item at all.
        """
        if not self.headerTable:
            self.headerTable = None
            return
        for transaction, count in self.frozenDataDict.items():
            # keep only the frequent items, remembering their support
            learnSet = {item: self.headerTable[item][0]
                        for item in transaction if item in self.headerTable}
            if learnSet:
                # highest support first; ties broken by item name (descending)
                orderedItems = [item for item, _ in
                                sorted(learnSet.items(),
                                       key=lambda kv: (kv[1], kv[0]),
                                       reverse=True)]
                self.updateTree(self.treeNode, orderedItems, count)


def main(minSupport=3):
    """Build and print the FP-tree of the transactions from loadSimpleData()."""
    data = loadSimpleData()
    dataDict = createInitSet(data)
    headerTable = getHeaderTable(dataDict, minSupport)
    fpTree = FPTree(dataDict, headerTable, minSupport)
    fpTree.createFPTree()
    fpTree.treeNode.show()


if __name__ == '__main__':
    data_translation = loadTransactions(inputfile)
    for i, transaction in enumerate(data_translation[:9]):
        print('数据转换结果的第 ', i + 1, ' 个元素:', transaction)
    main()
以上是关于python数据挖掘作业____商品间/商品内分析的主要内容,如果未能解决你的问题,请参考以下文章:
ElasticSearch_05_ES的嵌套聚合,下钻分析,聚合分析
ElasticSearch_05_ES的嵌套聚合,下钻分析,聚合分析
ElasticSearch_04_ES的嵌套聚合,下钻分析,聚合分析