Online Store Scraper with Crawl Records, Using 11STREET as an Example

Posted by 是黄天师哈哈哈


Overall approach

Step 1: Scrape all of the listing-page (category) links.

Step 2: For each listing page, get the total product count and the number of pages.

Step 3: Page through each listing page and scrape the product prices.

Step 4: After a listing page has been scraped, write out its product data and record the progress in a local file.

Final step: Merge the product data scraped from each listing page (a driver sketch tying these steps together follows below).
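
The sketch below is only an orientation aid, not code from the original article: it assumes the functions defined in the following sections (get_page_url_list, account_page_num, info_from_page_list), the merge_tmp_excels helper sketched in the final step, and the same illustrative paths used throughout the article.

import os
import pandas as pd

# illustrative paths, mirroring the per-step __main__ blocks below
date_str = '2023-04-06'
cate_excel_path = '/Users/xxxx/Downloads/11st_kr_page_list.xlsx'
crawl_record_path = f'/Users/xxxx/Downloads/11st_kr_page_reocrd_{date_str}.xlsx'
crawl_tmp_dir = f'/Users/xxxx/Downloads/11st_kr_page_reocrd_{date_str}'
merged_path = f'/Users/xxxx/Downloads/11st_kr_goods_{date_str}.xlsx'

def run_all():
    get_page_url_list(cate_excel_path)                    # step 1: listing-page links
    account_page_num(cate_excel_path, crawl_record_path)  # step 2: item counts and page counts
    if not os.path.exists(crawl_tmp_dir):
        os.mkdir(crawl_tmp_dir)
    crawl_record_df = pd.read_excel(crawl_record_path)
    for index, row in crawl_record_df.iterrows():         # steps 3 and 4: scrape each listing page
        info_from_page_list(index, row)                   # reads crawl_tmp_dir / crawl_record_path as globals
    merge_tmp_excels(crawl_tmp_dir, merged_path)          # final step: merge the temporary files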

 

Step 1

Scrape the site below to obtain its category information:

https://global.11st.co.kr/glb/

The categories here are loaded dynamically, so Selenium + ChromeDriver is needed.

The code is as follows:

import requests, random, os, math, time, re, pandas as pd, numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver

# path to chromedriver
CHROME_DRIVER_PATH = '/Users/xxxx/Downloads/chromedriver'


# fetch a dynamically rendered page and return it as a BeautifulSoup object
def get_dynamic_html(site_url):
    print('start loading dynamic page', site_url)
    chrome_options = webdriver.ChromeOptions()
    # disable the sandbox
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    # run headless if desired
    #chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--ignore-ssl-errors')
    driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH, chrome_options=chrome_options)
    driver.set_page_load_timeout(100)
    #driver.set_script_timeout(100)
    try:
        driver.get(site_url)
    except Exception as e:
        driver.execute_script('window.stop()')  # stop loading once the timeout is exceeded
        print(e, 'dynamic web load timeout')
    data = driver.page_source
    soup = BeautifulSoup(data, 'html.parser')
    try:
        driver.quit()
    except:
        pass
    return soup



# collect the listing-page links for every third-level category
def get_page_url_list(cate_path):
    cate_url_list = []

    print('start crawling')
    page_url = 'https://global.11st.co.kr/glb/en/browsing/Category.tmall?method=getCategory2Depth&dispCtgrNo=1001819#'
    soup = get_dynamic_html(page_url)
    print(soup.prettify())
    one_cate_ul_list = soup.select('#lnbMenu > ul > li')

    for i in range(0, len(one_cate_ul_list)):
        one_cate_ul = one_cate_ul_list[i]
        one_cate_name = one_cate_ul.select('a')[0].text
        one_cate_url = one_cate_ul.select('a')[0].attrs['href']

        two_cate_ul_list = one_cate_ul.select('ul.list_category > li')
        for two_cate_ul in two_cate_ul_list:
            two_cate_name = two_cate_ul.select('a')[0].text
            two_cate_url = two_cate_ul.select('a')[0].attrs['href']

            three_cate_ul_list = two_cate_ul.select('li .list_sub_cate > li')
            for three_cate_ul in three_cate_ul_list:
                three_cate_name = three_cate_ul.select('a')[0].text
                three_cate_url = three_cate_ul.select('a')[0].attrs['href']

                cate_obj = {
                    'brand': 'global.11st.co',
                    'site': 'kr',
                    'one_cate_name': one_cate_name,
                    'one_cate_url': one_cate_url,
                    'two_cate_name': two_cate_name,
                    'two_cate_url': two_cate_url,
                    'three_cate_name': three_cate_name,
                    'three_cate_url': three_cate_url,
                }
                cate_url_list.append(cate_obj)

    cate_url_df = pd.DataFrame(cate_url_list)
    cate_url_df.to_excel(cate_path, index=False)



if __name__ == '__main__':
    # where the listing-page links are saved
    cate_excel_path = '/Users/xxxx/Downloads/11st_kr_page_list.xlsx'
    get_page_url_list(cate_excel_path)

Step 2

Each listing page shows the total number of products in the category; with 40 products displayed per page, the total page count can be computed.

Based on the file produced in Step 1, compute the number of pages for each listing page.
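
For example (the product count here is made up purely for illustration), a category reporting 1,234 products spans 31 pages:

import math

total_item_num = 1234                                    # hypothetical count parsed from the listing page
per_page_num = 40                                        # products shown per listing page
total_page_num = math.ceil(total_item_num / per_page_num)
print(total_page_num)                                    # -> 31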

# the required imports are all listed in Step 1
# get_static_html is defined in the Step 3/4 code below


# get the total item count and total page count for every listing page
def account_page_num(cate_path, reocrd_path):
    out_page_list = []
    page_list_df = pd.read_excel(cate_path)
    for index, row in page_list_df.iterrows():
        print(index, row)
        page_item = {
            'brand': row['brand'],
            'site': row['site'],
            'one_cate_name': row['one_cate_name'],
            'two_cate_name': row['two_cate_name'],
            'two_cate_url': row['two_cate_url'],
            'three_cate_name': row['three_cate_name'],
            'three_cate_url': row['three_cate_url']
        }
        page_item['total_item_num'] = 'not found tag'
        page_item['total_page_num'] = 0
        page_item['per_page_num'] = 40
        page_item['start_page_num'] = 0
        page_item['finsh_page_num'] = 0  # pages already crawled; steps 3/4 resume from this column

        soup = get_static_html(page_item['three_cate_url'])
        total_num_tag_list = soup.select('ul.categ > li.active')
        if len(total_num_tag_list) > 0:
            total_num_tag = total_num_tag_list[0]
            tag_text = total_num_tag.text
            # the count appears in parentheses, e.g. "(1,234)"
            num_pattern = re.compile('\\(([0-9 ,]+)\\)')
            num_arr = num_pattern.findall(tag_text)
            if len(num_arr) > 0:
                page_item['total_item_num'] = int(num_arr[0].replace(',', ''))
                page_item['total_page_num'] = math.ceil(page_item['total_item_num'] / page_item['per_page_num'])
            else:
                page_item['total_item_num'] = f'text error:{tag_text}'
        print(page_item)
        out_page_list.append(page_item)

    record_url_df = pd.DataFrame(out_page_list)
    record_url_df.to_excel(reocrd_path, index=False)


if __name__ == '__main__':
    date_str = '2023-04-06'
    # listing-page links produced in Step 1
    cate_excel_path = '/Users/xxxx/Downloads/11st_kr_page_list.xlsx'
    # crawl record: stores how many pages have been crawled so far, so a failed run
    # can resume where it left off instead of starting over
    crawl_record_path = f'/Users/xxxx/Downloads/11st_kr_page_reocrd_{date_str}.xlsx'
    account_page_num(cate_excel_path, crawl_record_path)

  

Steps 3 and 4

The code is as follows:

# the required imports are all listed in Step 1


# fetch a static page and return it as a BeautifulSoup object
def get_static_html(site_url):
    print('start loading page', site_url)
    headers_list = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    ]
    headers = {
        'user-agent': random.choice(headers_list),  # rotate user agents
        'Connection': 'keep-alive'
    }
    try:
        resp = requests.get(site_url, headers=headers)
    except Exception as inst:
        print(inst)
        # retry without certificate verification if the first request fails
        requests.packages.urllib3.disable_warnings()
        resp = requests.get(site_url, headers=headers, verify=False)
    soup = BeautifulSoup(resp.text, 'html.parser')
    return soup



# write a DataFrame to Excel without converting strings to URLs
def obj_list_to_df_wihout_url(obj_df, out_path):
    conten_writer = pd.ExcelWriter(out_path, engine='xlsxwriter', options={'strings_to_urls': False})
    obj_df.to_excel(conten_writer, index=False)
    conten_writer.close()



# scrape the product info of one listing page, paging through it and
# recording progress so an interrupted run can resume
def info_from_page_list(index, page_item):
    # upper limit on the number of pages crawled per listing page
    max_limit = 250

    # where this listing page's products are saved
    three_cate_name = page_item['three_cate_name'].strip().replace(' ', '&').replace('/', '&')
    now_out_path = f"{crawl_tmp_dir}/{index}_{three_cate_name}.xlsx"

    total_page_num = page_item['total_page_num'] if page_item['total_page_num'] <= max_limit else max_limit
    finsh_page_num = page_item['finsh_page_num']
    print(finsh_page_num, total_page_num)

    # starting from scratch
    if finsh_page_num == 0 and not os.path.exists(now_out_path):
        out_goods_list = []
    # resuming a previous crawl: reload what was already saved
    else:
        already_obj_df = pd.read_excel(now_out_path)
        out_goods_list = already_obj_df.to_dict('records')


    if finsh_page_num == total_page_num:
        print(f"{index} {page_item['three_cate_name']} crawl already finished")
    for i in range(finsh_page_num, total_page_num):
        # the listing page paginates via the #pageNum%% fragment
        page_url = f"{page_item['three_cate_url']}#pageNum%%{i + 1}"

        soup = get_static_html(page_url)
        info_tag_list = soup.select('ul.tt_listbox > li')

        for goods_tag in info_tag_list:
            info_item = page_item.to_dict()  # copy the listing-page metadata into this product's record
            pattern_tag_3 = re.compile('products/([0-9]+)')
            href_tag = goods_tag.select('.photo_wrap >  a')[0]
            desc_tag = goods_tag.select('.list_info > .info_tit')[0]
            #feedback_tag = goods_tag.select('.list_info  .sfc')
            #collect_tag = goods_tag.select('.list_info  .def_likethis')
            price_tag = goods_tag.select('.list_price  .dlr')[0]

            info_item['href'] = href_tag.attrs['href']
            info_item['product_id'] = ''
            info_item['desc'] = desc_tag.text
            #info_item['feedback'] = feedback_tag.text
            #info_item['collect'] = collect_tag.text

            # price in KRW and a rough USD conversion
            info_item['price_kr'] = int(price_tag.attrs['data-finalprice'])
            info_item['price_us'] = round(info_item['price_kr'] * 0.0007959, 2)

            if info_item['href'] != '':
                id_arr = pattern_tag_3.findall(info_item['href'])
                if len(id_arr) > 0:
                    info_item['product_id'] = id_arr[0]
            out_goods_list.append(info_item)

        # save every 50 pages and on the last page
        if i == total_page_num - 1 or i % 50 == 0:
            print('saving')
            # temporarily save the products scraped so far
            out_goods_df = pd.DataFrame(out_goods_list)
            obj_list_to_df_wihout_url(out_goods_df, now_out_path)
            print('updating crawl record')
            # update the crawl record (crawl_record_path is set in the __main__ block below)
            crawl_record_df = pd.read_excel(crawl_record_path)
            crawl_record_df.loc[index, 'finsh_page_num'] = i + 1
            print(crawl_record_df.loc[index, 'finsh_page_num'])
            obj_list_to_df_wihout_url(crawl_record_df, crawl_record_path)



if __name__ == '__main__':
    date_str = '2023-04-06'
    # crawl record for this run
    crawl_record_path = f'/Users/xxx/Downloads/11st_kr_page_reocrd_{date_str}.xlsx'
    # directory for the temporary per-category product files
    crawl_tmp_dir = f'/Users/xxx/Downloads/11st_kr_page_reocrd_{date_str}'
    if not os.path.exists(crawl_tmp_dir):
        os.mkdir(crawl_tmp_dir)

    crawl_record_df = pd.read_excel(crawl_record_path)
    for index, row in crawl_record_df.iterrows():
        info_from_page_list(index, row)

 

Final step

Merge the temporary Excel files of product data stored in crawl_tmp_dir into a single file, as sketched below.
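
A minimal sketch of this merge, assuming the directory layout produced in steps 3 and 4 (one .xlsx file per listing page inside crawl_tmp_dir); merge_tmp_excels and merged_path are illustrative names rather than part of the original code:

import os
import pandas as pd

# combine every temporary per-category Excel file into a single product file
def merge_tmp_excels(crawl_tmp_dir, merged_path):
    df_list = []
    for file_name in sorted(os.listdir(crawl_tmp_dir)):
        if file_name.endswith('.xlsx'):
            df_list.append(pd.read_excel(os.path.join(crawl_tmp_dir, file_name)))
    merged_df = pd.concat(df_list, ignore_index=True)
    # to_excel is used here for simplicity; obj_list_to_df_wihout_url from steps 3/4
    # could be reused instead to stop long product URLs being converted to links
    merged_df.to_excel(merged_path, index=False)


if __name__ == '__main__':
    date_str = '2023-04-06'
    crawl_tmp_dir = f'/Users/xxx/Downloads/11st_kr_page_reocrd_{date_str}'
    merged_path = f'/Users/xxx/Downloads/11st_kr_goods_{date_str}.xlsx'
    merge_tmp_excels(crawl_tmp_dir, merged_path)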
