import requests
from lxml import etree
from pymysql import *
from tkinter import *
from urllib import parse
# --- Tkinter UI: a keyword field plus start/end page fields for the scraper ---
window = Tk()
window.title("天猫列表商品采集")
window.geometry('200x180')


def _labeled_entry(caption):
    """Pack a Label with *caption* above an Entry; return the Entry's StringVar."""
    Label(window, text=caption).pack()
    var = StringVar()
    Entry(window, textvariable=var).pack()
    return var


name = _labeled_entry('关键字采集')    # search keyword
to_page = _labeled_entry('采集起始页')  # first page to scrape
w_page = _labeled_entry('采集结束页')   # last page to scrape
class Tmall(object):
    """Scrape Tmall keyword-search listing pages and store the rows in MySQL.

    The constructor builds one URL per requested page; ``run()`` fetches,
    parses and saves each page in turn.
    """

    def __init__(self, name, to_page, w_page):
        """Prepare headers and the list of listing-page URLs.

        :param name: search keyword (GBK percent-encoded into the query string)
        :param to_page: first page to scrape, 1-based
        :param w_page: last page to scrape, inclusive
        """
        content = parse.quote(name, encoding='gbk')
        self.url = 'https://list.tmall.com/search_product.htm?s={}&q=' + content
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'cookie': 't=7a76b0e2463b7da48fec3e6c41b7db7b; _tb_token_=5d36638e3a354; cookie2=11020327dbcec48c00a5f04268565e93',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
        }
        self.url_list = []
        for page in range(to_page, w_page + 1):
            # Tmall's "s" parameter is an item offset, 60 items per page, so
            # page N starts at (N - 1) * 60.  The original code started at
            # to_page * 60 and incremented *before* the first append, which
            # skipped the first page(s) of the requested range.
            self.page = (page - 1) * 60
            self.url_list.append(self.url.format(self.page))

    def get_data(self, url):
        """Fetch *url* with the prepared headers and return the GBK-decoded body."""
        response = requests.get(url, headers=self.headers, timeout=10)
        return response.content.decode('gbk')

    def parse_data(self, data):
        """Extract one dict per product node from a listing page's HTML.

        Missing fields are recorded as the placeholder string "空" (empty).
        """
        # Parse the page source into an element tree.
        html = etree.HTML(data)
        node_list = html.xpath('//*[@id="J_ItemList"]/div/div')
        data_list = []
        for i, node in enumerate(node_list):
            img_a = ''.join(node.xpath('./div[1]/a/img/@data-ks-lazyload'))
            img_b = ''.join(node.xpath('./div[1]/a/img/@src'))
            # Items below the fold are lazy-loaded, so the real image URL sits
            # in @data-ks-lazyload rather than @src for all but the first rows.
            img_link = img_a if i > 4 else img_b
            # Product link
            link_url = ''.join(node.xpath('./p[2]/a/@href'))
            goods_link = link_url if link_url else "空"
            # Price
            prices = ''.join(node.xpath('./p[1]/em/@title'))
            price = prices if prices else "空"
            # Title
            titles = ''.join(node.xpath('./p[2]/a/@title'))
            title = titles if titles else "空"
            # Shop/company name: the node layout differs between result pages.
            # NOTE(review): self.page is the offset of the *last* URL built in
            # __init__, not of the page being parsed — this looks fragile;
            # confirm which layout each page really uses.
            company_a = ''.join(node.xpath('./div[3]/a/text()'))
            company_b = ''.join(node.xpath('./div[2]/a/text()'))
            company = company_a if self.page == 60 else company_b
            # Monthly deal count
            deal_counts = ''.join(node.xpath('./p[3]/span[1]/em/text()'))
            deal_count = deal_counts if deal_counts else "空"
            # Comment count
            comment_counts = ''.join(node.xpath('./p[3]/span[2]/a/text()'))
            comment_count = comment_counts if comment_counts else "空"
            # Renamed from "time" (shadowed the stdlib module name).
            item = {
                'name': '天猫',
                'img_link': img_link,
                'goods_link': goods_link,
                'price': price,
                'title': title,
                'company': company,
                'deal_count': deal_count,
                'comment_count': comment_count,
            }
            data_list.append(item)
        return data_list

    def save_data(self, data_list):
        """Insert the scraped rows into the ``data`` table.

        Errors are appended to log.txt instead of crashing the GUI.
        """
        conn = None  # so the finally clause is safe if Connect() itself fails
        try:
            conn = Connect(host="127.0.0.1", user="root", password="root",
                           database="data_list", port=3306, charset="utf8")
            cs1 = conn.cursor()
            # Parameterized query: the original interpolated scraped text
            # straight into the SQL string, which breaks on quotes and is an
            # injection risk.
            sql = ('insert into data(name,goods_link,img_link,title,price,'
                   'company,deal_count,comment_count) '
                   'values(%s,%s,%s,%s,%s,%s,%s,%s)')
            for data in data_list:
                count = cs1.execute(sql, (
                    data['name'], data['goods_link'], data['img_link'],
                    data['title'], data['price'], data['company'],
                    data['deal_count'], data['comment_count']))
                print(count)  # rows affected (1 per successful insert)
            cs1.close()
            # Commit all the inserts at once.
            conn.commit()
        except Exception as e:
            # Append the error to the log file ('\n' — the original wrote a
            # literal backslash-n).
            with open('log.txt', 'a') as f:
                f.write(repr(e) + '\n')
        finally:
            if conn is not None:
                conn.close()

    def run(self):
        """Fetch, parse and save every page in self.url_list."""
        for url in self.url_list:
            data = self.get_data(url)
            data_list = self.parse_data(data)
            self.save_data(data_list)
def main():
    """Button callback: read the GUI fields and run the scraper.

    Raises ValueError if the page fields are not integers.
    """
    keyword = str(name.get())
    first = int(to_page.get())
    last = int(w_page.get())
    # Renamed from 'all', which shadowed the builtin of the same name.
    spider = Tmall(keyword, first, last)
    spider.run()
if __name__ == '__main__':
    # Wire the confirm button to main() and hand control to the Tk event loop.
    confirm = Button(window, text="确定", relief='groove',
                     width=9, height=1, bd=4, command=main)
    confirm.pack()
    window.mainloop()