python数据挖掘作业____商品间/商品内分析

Posted 孤影化双皮奶

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python数据挖掘作业____商品间/商品内分析相关的知识,希望对你有一定的参考价值。

 # -*- coding: utf-8 -*-
 
 # 代码8-1 查看数据特征
 
 import numpy as np
 import pandas as pd
 
 # Load the order records (GBK-encoded CSV) and print the frame's summary.
 # The backslash-escaped quotes of the pasted version were not valid Python;
 # plain quotes are restored here.
 inputfile = 'D://CourseAssignment//AI//GoodSellMod//GoodsOrder.csv'  # input data file
 data = pd.read_csv(inputfile, encoding='gbk')  # read the data
 data.info()  # prints the column/dtype overview to stdout

 # Describe the 'id' column: number of values, minimum and maximum.
 data = data['id']  # NOTE: `data` is rebound to the id Series from here on
 description = [data.count(), data.min(), data.max()]
 # Transpose so the three statistics become the columns of a one-row frame.
 description = pd.DataFrame(description, index=['Count', 'Min', 'Max']).T
 print('描述性统计结果:\n', np.round(description))  # print the result

 

 # Sales volume and share of the ten best-selling goods.
 import pandas as pd
 inputfile = 'D://CourseAssignment//AI//GoodSellMod//GoodsOrder.csv'  # input data file
 data = pd.read_csv(inputfile, encoding='gbk')  # read the data
 # Count the order rows per product; the count ends up in the 'id' column.
 group = data.groupby(['Goods']).count().reset_index()
 # Deliberately NOT named `sorted`: rebinding that name at module level would
 # hide the builtin sorted() for everything that runs later in this script.
 sorted_goods = group.sort_values('id', ascending=False)
 top10 = sorted_goods[:10]
 print('销量排行前10商品的销量:\n', top10)  # ten best sellers, highest first

 # Horizontal bar chart of the ten best sellers.
 import matplotlib.pyplot as plt
 plt.rcParams['font.sans-serif'] = 'SimHei'  # font able to render the Chinese labels
 x = top10['Goods']
 y = top10['id']
 plt.figure(figsize=(8, 4))  # canvas size
 plt.barh(x, y)
 plt.xlabel('销量')  # x-axis label
 plt.ylabel('商品类别')  # y-axis label
 plt.title('商品的销量TOP10--3009')  # chart title
 plt.savefig('D://CourseAssignment//AI//GoodSellMod//tmp//top10.png')  # save as .png
 plt.show()  # display the figure

 # Share of each top-10 product relative to all order rows.
 data_nums = data.shape[0]
 for _, row in top10.iterrows():
     print(row['Goods'], row['id'], row['id'] / data_nums)

 

 

 # Code 8-3: sales volume and share of every product category.

 import pandas as pd
 inputfile1 = 'D://CourseAssignment//AI//GoodSellMod//GoodsOrder.csv'
 inputfile2 = 'D://CourseAssignment//AI//GoodSellMod//GoodsTypes.csv'
 data = pd.read_csv(inputfile1, encoding='gbk')
 types = pd.read_csv(inputfile2, encoding='gbk')  # product -> category table

 # Rows per product (count lands in 'id'), best sellers first.
 group = data.groupby(['Goods']).count().reset_index()
 sort = group.sort_values('id', ascending=False).reset_index(drop=True)
 data_nums = data.shape[0]  # total number of order rows

 # Attach the category to every product; merge joins on the shared column(s).
 sort_links = pd.merge(sort, types)
 # Total sales per category, largest first. Only 'id' is summed: summing the
 # whole frame would also "sum" (concatenate) the string-valued Goods column
 # on pandas versions that no longer drop non-numeric columns.
 sort_link = sort_links.groupby('Types', as_index=False)['id'].sum()
 sort_link = sort_link.sort_values('id', ascending=False).reset_index(drop=True)

 # Share of each category, then print and save the table.
 sort_link['percent'] = sort_link['id'] / data_nums
 print('各类别商品的销量及其占比:\n', sort_link)
 outfile1 = 'D://CourseAssignment//AI//GoodSellMod//tmp//percent.csv'
 sort_link.to_csv(outfile1, index=False, header=True, encoding='gbk')  # save the result

 # Pie chart of the per-category share.
 import matplotlib.pyplot as plt
 plt.rcParams['font.sans-serif'] = 'SimHei'  # font able to render the Chinese labels
 data = sort_link['percent']  # NOTE: `data` is rebound to the share column
 labels = sort_link['Types']
 plt.figure(figsize=(8, 6))  # canvas size
 plt.pie(data, labels=labels, autopct='%1.2f%%')
 plt.title('每类商品销量占比--3009')  # chart title
 plt.savefig('D://CourseAssignment//AI//GoodSellMod//tmp//persent.png')  # save as .png
 plt.show()

 

 

 # Code 8-4: sales volume and share of each product inside "非酒精饮料"
 # (non-alcoholic drinks).

 # Pick the rows of that category. `.copy()` gives an independent frame so the
 # column assignment below does not write into a slice of `sort_links`.
 selected = sort_links.loc[sort_links['Types'] == '非酒精饮料'].copy()
 child_nums = selected['id'].sum()  # total sales of the category
 selected['child_percent'] = selected['id'] / child_nums  # share inside the category
 selected.rename(columns={'id': 'count'}, inplace=True)
 print('非酒精饮料内部商品的销量及其占比:\n', selected)
 outfile2 = 'D://CourseAssignment//AI//GoodSellMod//tmp//child_percent.csv'
 # Write the per-product table computed above (the original wrote the
 # per-category table `sort_link` here by mistake).
 selected.to_csv(outfile2, index=False, header=True, encoding='gbk')

 # Pie chart of the share of each non-alcoholic drink.
 import matplotlib.pyplot as plt
 plt.rcParams['font.sans-serif'] = 'SimHei'  # font able to render the Chinese labels
 data = selected['child_percent']  # NOTE: `data` is rebound to the share column
 labels = selected['Goods']
 plt.figure(figsize=(8, 6))  # canvas size
 # Offset of every wedge from the centre. NOTE(review): the tuple has 11
 # entries, so it presumably needs exactly 11 products in this category;
 # confirm against the data.
 explode = (0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.08, 0.3, 0.1, 0.3)
 plt.pie(data, explode=explode, labels=labels, autopct='%1.2f%%',
         pctdistance=1.1, labeldistance=1.2)
 plt.title("非酒精饮料内部各商品的销量占比--3009")  # chart title
 plt.axis('equal')
 plt.savefig('D://CourseAssignment//AI//GoodSellMod//tmp//child_persent.png')  # save the figure
 plt.show()  # display the figure

 

 

 # Sales volume and share of each product inside the "西点" (pastry) category.

 # Pick the rows of that category. `.copy()` gives an independent frame so the
 # column assignment below does not write into a slice of `sort_links`.
 selected = sort_links.loc[sort_links['Types'] == '西点'].copy()
 child_nums = selected['id'].sum()  # total sales of the pastry category
 selected['child_percent'] = selected['id'] / child_nums  # share inside the category
 selected.rename(columns={'id': 'count'}, inplace=True)
 print('西点内部商品的销量及其占比:\n', selected)
 outfile3 = 'D://CourseAssignment//AI//GoodSellMod//tmp//bread_precent.csv'
 # Write the per-product table computed above (the original wrote the
 # per-category table `sort_link` here by mistake).
 selected.to_csv(outfile3, index=False, header=True, encoding='gbk')

 # Pie chart of the share of each pastry product; wedges are not exploded.
 import matplotlib.pyplot as plt
 plt.rcParams['font.sans-serif'] = 'SimHei'  # font able to render the Chinese labels
 data = selected['child_percent']  # NOTE: `data` is rebound to the share column
 labels = selected['Goods']
 plt.figure(figsize=(8, 6))  # canvas size
 plt.pie(data, labels=labels, autopct='%1.2f%%',
         pctdistance=1.1, labeldistance=1.2)
 plt.title("西点内部各商品的销量占比--3009")  # chart title
 plt.axis('equal')
 plt.savefig('D://CourseAssignment//AI//GoodSellMod//tmp//bread_precent.png')  # save the figure
 plt.show()  # display the figure

 

 

 # -*- coding: utf-8 -*-
 import pandas as pd
 inputfile = 'D://CourseAssignment//AI//GoodSellMod//GoodsOrder.csv'
 data = pd.read_csv(inputfile, encoding='gbk')

 # Turn the (id, Goods) rows into one list of goods per order id. Grouping
 # straight into lists replaces the earlier "join with commas, then split
 # again" round-trip and leaves product names untouched.
 data_translation = data.groupby('id')['Goods'].apply(list).tolist()

 # Preview at most the first nine baskets; slicing avoids an IndexError when
 # the file holds fewer than nine orders.
 for i, basket in enumerate(data_translation[:9], start=1):
     print('数据转换结果的第 ', i, ' 个元素:', basket)
 
 
 #########################################################
 
 
 def loadSimpleData():
     """Return up to the first nine baskets of the module-level ``data_translation``.

     Prints a three-line ``********`` banner first. The result is a new list
     whose elements are the same basket lists held by ``data_translation``.
     When fewer than nine baskets exist, all of them are returned instead of
     raising ``IndexError``.
     """
     for _ in range(3):
         print("********")
     # A hand-written toy data set of the same shape, e.g.
     #   [['beer', 'milk', 'chicken'], ['milk', 'bread'], ['milk', 'diaper'], ...]
     # can be returned here instead when experimenting without the CSV file.
     return data_translation[:9]
 
 
 
 def createInitSet(dataSet: list) -> dict:
     """Count identical transactions.

     Each transaction is converted to a ``frozenset`` (so item order and
     duplicates inside a transaction are ignored) and used as a dict key.

     :param dataSet: iterable of transactions, each an iterable of hashable items.
     :return: dict mapping ``frozenset(transaction)`` to its number of occurrences.
     """
     # The empty-dict literal was missing in the pasted version.
     returnSet = {}

     for item in dataSet:
         frozenItem = frozenset(item)
         returnSet[frozenItem] = returnSet.get(frozenItem, 0) + 1

     return returnSet
 
 
 class TreeNode(object):
     """One node of the FP-tree."""

     def __init__(self, nameValue: str, numOccur: int, parentNode):
         # Name of the item stored in this node.
         self.name = nameValue
         # Number of occurrences of the item counted at this node.
         self.count = numOccur
         # Next node holding the same item (None until one is linked).
         self.nodeLink = None
         # Parent node (the tree root is created with None).
         self.parentNode = parentNode
         # Child nodes keyed by item name, e.g. 'milk': TreeNode('milk', ...).
         # The empty-dict literal was missing in the pasted version.
         self.children = {}

     def inc(self, count):
         """Add ``count`` to this node's occurrence counter."""
         self.count += count

     def show(self, ind=1):
         """Print this node and, recursively, its children.

         :param ind: nesting level; each level adds three spaces of indentation.
         """
         print('   ' * ind, self.name, ' ', self.count)
         for child in self.children.values():
             child.show(ind + 1)
 
 
 def getHeaderTable(dataSet, minSupport=1) -> dict:
     """Compute the support count of every item and drop the infrequent ones.

     :param dataSet: dict mapping a transaction (iterable of items) to the
         number of times that transaction occurs.
     :param minSupport: items whose total count is below this are removed.
     :return: dict mapping item -> support count for the items kept.
     """
     # The empty-dict literal was missing in the pasted version.
     headerTable = {}

     for key, value in dataSet.items():
         for item in key:
             headerTable[item] = headerTable.get(item, 0) + value

     # Keep only the items that reach the minimum support.
     return {item: support for item, support in headerTable.items() if support >= minSupport}
 
 
 def makeHeaderTable(headerTable: dict) -> dict:
     """Replace every value of ``headerTable`` by the pair ``[value, None]``.

     The dict is modified in place and the same object is returned.
     """
     # Only values are replaced, so the key set stays stable during the loop.
     for name, support in headerTable.items():
         headerTable[name] = [support, None]
     return headerTable
 
 
 def updateHeaderTable(toastNode: TreeNode, targetNode: TreeNode):
     """Append ``targetNode`` to the end of the ``nodeLink`` chain starting at ``toastNode``."""
     tail = toastNode
     # Follow the links until the node whose nodeLink is still unset.
     while tail.nodeLink is not None:
         tail = tail.nodeLink
     tail.nodeLink = targetNode
 
 
 class FPTree:
     """FP-tree built from counted transactions and an item header table."""

     def __init__(self, frozenDataDict: dict, headerTable: dict, minSupport: int):
         """Store the inputs and create the root node.

         :param frozenDataDict: dict mapping a transaction (iterable of items)
             to its occurrence count.
         :param headerTable: dict mapping item -> support count; it is
             converted in place by ``makeHeaderTable``.
         :param minSupport: stored on the instance; not read by the methods
             of this class.
         """
         # Root node of the tree, named 'null'.
         self.treeNode = TreeNode('null', 1, None)
         # 'milk': [counter, nodeLink]
         self.headerTable = makeHeaderTable(headerTable)
         self.frozenDataDict = frozenDataDict
         self.minSupport = minSupport

     def updateTree(self, treeNode, items: list, count: int):
         """Insert the ordered path ``items`` below ``treeNode``.

         Existing nodes along the path get ``count`` added; missing nodes are
         created with ``count`` and linked into the header table's node chain.
         An empty ``items`` list is a no-op.
         """
         node = treeNode
         for item in items:
             child = node.children.get(item)
             if child is not None:
                 child.inc(count)
             else:
                 child = TreeNode(item, count, node)
                 node.children[item] = child
                 entry = self.headerTable[item]
                 if entry[1] is None:
                     # First node for this item: it starts the chain.
                     entry[1] = child
                 else:
                     updateHeaderTable(entry[1], child)
             node = child

     def createFPTree(self):
         """Insert every transaction into the tree.

         If the header table has no items, ``self.headerTable`` is set to
         ``None`` and nothing is inserted.
         """
         freqItems = set(self.headerTable.keys())

         if not freqItems:
             self.headerTable = None
             return

         for transaction, count in self.frozenDataDict.items():
             # Frequent items of this transaction with their global support.
             # The empty-dict literal was missing in the pasted version.
             learnSet = {
                 item: self.headerTable[item][0]
                 for item in transaction
                 if item in freqItems
             }

             if learnSet:
                 # Order by support, then by name, both descending.
                 # list.sort() keeps this independent of the module-level name
                 # `sorted`, which script code in this file may rebind.
                 ranked = list(learnSet.items())
                 ranked.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
                 orderedItems = [name for name, _ in ranked]
                 self.updateTree(self.treeNode, orderedItems, count)
 
 
 def main(minSupport: int = 3):
     """Build the FP-tree for the sample baskets and print it.

     :param minSupport: support threshold passed both to ``getHeaderTable``
         and to ``FPTree``; defaults to 3, the value previously hard-coded in
         both places.
     """
     transactions = loadSimpleData()
     dataDict = createInitSet(transactions)
     headerTable = getHeaderTable(dataDict, minSupport)
     fpTree = FPTree(dataDict, headerTable, minSupport)
     fpTree.createFPTree()
     fpTree.treeNode.show()

 main()

 

以上是关于python数据挖掘作业____商品间/商品内分析的主要内容,如果未能解决你的问题,请参考以下文章

淘淘商城02——dubbo框架整合_商品列表查询实现_分页

ElasticSearch_05_ES的嵌套聚合,下钻分析,聚合分析

ElasticSearch_05_ES的嵌套聚合,下钻分析,聚合分析

ElasticSearch_04_ES的嵌套聚合,下钻分析,聚合分析

ElasticSearch_05_ES的嵌套聚合,下钻分析,聚合分析

Python 爬虫知识点 - 淘宝商品检索结果抓包分析(续二)