Python爬虫四

Posted hhh江月

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Python爬虫四相关的知识,希望对你有一定的参考价值。

Python爬虫四

import requests
import re
import os

"""
1、注意headers
2、注意正则表达式的获取
3、注意遍历访问每一个图片
4、文件操作完毕需要关闭
5、注意文件存放的路径的相关的问题
6、需要考虑对方拒绝连接的可能性
end
"""


# Baidu image-search results page for the URL-encoded query in `word=`.
url = "https://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&dyTabStr=MCwzLDIsMSw0LDYsNSw3LDgsOQ%3D%3D&word=%E7%A7%91%E6%8A%80%E5%9B%BE%E7%89%87"

# Browser-like request headers sent with the search-page request.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    # "br" is deliberately not advertised: requests only decodes Brotli when
    # an optional brotli package is installed, and the body is later decoded
    # as text.
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Host": "image.baidu.com",
    "Pragma": "no-cache",
    "Referer": "https://www.baidu.com/s?wd=%E7%A7%91%E6%8A%80%E5%9B%BE%E7%89%87&rsv_spt=1&rsv_iqid=0xb3d54ea200021ed3&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&rqlang=cn&tn=baiduhome_pg&rsv_enter=0&rsv_dl=tb&rsv_t=d456rVqSt9ihoMGT%2FYOAjQtBr6FNxnElEE%2FhvuXkiwrkYKDvyORyLrdta2tLzWWIkn3M&oq=%25E7%25A7%2591%25E6%258A%2580%25E5%259B%25BE%25E7%2589%2587&rsv_pq=acf504fa00040108&rsv_btype=t&prefixsug=%25E7%25A7%2591%25E6%258A%2580%25E5%259B%25BE%25E7%2589%2587&rsp=7",
    "sec-ch-ua": '" Not;A Brand";v="99", "Microsoft Edge";v="97", "Chromium";v="97"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 Edg/97.0.1072.69",
}

# SECURITY: the session cookie (it carried a logged-in BDUSS token) is no
# longer hard-coded in the source. Export it as BAIDU_COOKIE if the request
# needs it; when the variable is unset, no Cookie header is sent.
_cookie = os.environ.get("BAIDU_COOKIE")
if _cookie:
    headers["Cookie"] = _cookie

# Request the Baidu image-search results page. The timeout keeps the script
# from hanging forever if the server never answers.
res = requests.get(url=url, headers=headers, timeout=10)

print(res.status_code)

# Decode the body once and reuse it for both the console dump and the file.
page_html = res.content.decode()
print(page_html)

# Keep a copy of the page source. The encoding is explicit so the write does
# not depend on the platform default (e.g. GBK on Chinese Windows).
with open("baidu.html", "w", encoding="utf-8") as f:
    f.write(page_html)

"""
<img class="main_img img-hover" data-imgurl="https://img0.baidu.com/it/u=1770203103,1319507463&amp;fm=253&amp;fmt=auto&amp;app=138&amp;f=JPEG?w=780&amp;h=284" src="https://img0.baidu.com/it/u=1770203103,1319507463&amp;fm=253&amp;fmt=auto&amp;app=138&amp;f=JPEG?w=780&amp;h=284" style="width: 378px; height: 200px;">
<img class="main_img img-hover" data-imgurl="https://img0.baidu.com/it/u=1197544951,1783440312&amp;fm=253&amp;fmt=auto&amp;app=138&amp;f=JPEG?w=801&amp;h=400" src="https://img0.baidu.com/it/u=1197544951,1783440312&amp;fm=253&amp;fmt=auto&amp;app=138&amp;f=JPEG?w=801&amp;h=400" style="width: 348px; height: 200px;">
"""

# The image links are not read from the <img> tags shown above; the pattern
# captures the value of every "objURL":"..." pair found in the response text.
data = re.findall(r'"objURL":"(.*?)"', res.content.decode())

print(data)

# Record the links, one URL per line. The `with` block closes the file once
# all links have been written.
with open("baidu.txt", "w", encoding="utf-8") as file:
    file.writelines(f"{link}\n" for link in data)


# Download every link in `data` into <cwd>/crawlering_basic/baidu_tech_pic/
# as 1.jpg, 2.jpg, ... where the number is the link's 1-based position.
headers_pic = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 Edg/97.0.1072.69",
}

# os.path.join inserts the separators, so a forgotten trailing separator can
# no longer put the images next to the target folder instead of inside it.
save_dir = os.path.join(os.getcwd(), "crawlering_basic", "baidu_tech_pic")
os.makedirs(save_dir, exist_ok=True)

num = 1
for j in data:
    try:
        res_pic = requests.get(url=j, headers=headers_pic, timeout=10)
        res_pic.raise_for_status()
    except requests.RequestException as err:
        # The image host may refuse, time out, or return an error status;
        # skip this image and carry on with the remaining links.
        print(f"skipping image {num} ({j}): {err}")
    else:
        with open(os.path.join(save_dir, f"{num}.jpg"), "wb") as f0:
            f0.write(res_pic.content)

    # Advance even after a failure so file numbers keep matching link order.
    num += 1



这个爬虫有点麻烦:

设置了不少请求头(headers)等内容。

import requests
import re
import os

"""
1、注意headers
2、注意正则表达式的获取
3、注意遍历访问每一个图片
4、文件操作完毕需要关闭
5、注意文件存放的路径的相关的问题
6、需要考虑对方拒绝连接的可能性
end
"""


# Baidu image-search results page for the URL-encoded query in `word=`.
url = "https://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&dyTabStr=MCwzLDIsMSw0LDYsNSw3LDgsOQ%3D%3D&word=%E7%A7%91%E6%8A%80%E5%9B%BE%E7%89%87"

# Browser-like request headers sent with the search-page request.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    # "br" is deliberately not advertised: requests only decodes Brotli when
    # an optional brotli package is installed, and the body is later decoded
    # as text.
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Host": "image.baidu.com",
    "Pragma": "no-cache",
    "Referer": "https://www.baidu.com/s?wd=%E7%A7%91%E6%8A%80%E5%9B%BE%E7%89%87&rsv_spt=1&rsv_iqid=0xb3d54ea200021ed3&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&rqlang=cn&tn=baiduhome_pg&rsv_enter=0&rsv_dl=tb&rsv_t=d456rVqSt9ihoMGT%2FYOAjQtBr6FNxnElEE%2FhvuXkiwrkYKDvyORyLrdta2tLzWWIkn3M&oq=%25E7%25A7%2591%25E6%258A%2580%25E5%259B%25BE%25E7%2589%2587&rsv_pq=acf504fa00040108&rsv_btype=t&prefixsug=%25E7%25A7%2591%25E6%258A%2580%25E5%259B%25BE%25E7%2589%2587&rsp=7",
    "sec-ch-ua": '" Not;A Brand";v="99", "Microsoft Edge";v="97", "Chromium";v="97"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 Edg/97.0.1072.69",
}

# SECURITY: the session cookie (it carried a logged-in BDUSS token) is no
# longer hard-coded in the source. Export it as BAIDU_COOKIE if the request
# needs it; when the variable is unset, no Cookie header is sent.
_cookie = os.environ.get("BAIDU_COOKIE")
if _cookie:
    headers["Cookie"] = _cookie

# Request the Baidu image-search results page. The timeout keeps the script
# from hanging forever if the server never answers.
res = requests.get(url=url, headers=headers, timeout=10)

print(res.status_code)

# Decode the body once and reuse it for both the console dump and the file.
page_html = res.content.decode()
print(page_html)

# Keep a copy of the page source. The encoding is explicit so the write does
# not depend on the platform default (e.g. GBK on Chinese Windows).
with open("baidu.html", "w", encoding="utf-8") as f:
    f.write(page_html)

"""
<img class="main_img img-hover" data-imgurl="https://img0.baidu.com/it/u=1770203103,1319507463&amp;fm=253&amp;fmt=auto&amp;app=138&amp;f=JPEG?w=780&amp;h=284" src="https://img0.baidu.com/it/u=1770203103,1319507463&amp;fm=253&amp;fmt=auto&amp;app=138&amp;f=JPEG?w=780&amp;h=284" style="width: 378px; height: 200px;">
<img class="main_img img-hover" data-imgurl="https://img0.baidu.com/it/u=1197544951,1783440312&amp;fm=253&amp;fmt=auto&amp;app=138&amp;f=JPEG?w=801&amp;h=400" src="https://img0.baidu.com/it/u=1197544951,1783440312&amp;fm=253&amp;fmt=auto&amp;app=138&amp;f=JPEG?w=801&amp;h=400" style="width: 348px; height: 200px;">
"""

# The image links are not read from the <img> tags shown above; the pattern
# captures the value of every "objURL":"..." pair found in the response text.
data = re.findall(r'"objURL":"(.*?)"', res.content.decode())

print(data)

# Record the links, one URL per line. The `with` block closes the file once
# all links have been written.
with open("baidu.txt", "w", encoding="utf-8") as file:
    file.writelines(f"{link}\n" for link in data)


# Download every link in `data` into <cwd>/crawlering_basic/baidu_tech_pic/
# as 1.jpg, 2.jpg, ... where the number is the link's 1-based position.
headers_pic = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 Edg/97.0.1072.69",
}

# os.path.join inserts the separators, so a forgotten trailing separator can
# no longer put the images next to the target folder instead of inside it.
save_dir = os.path.join(os.getcwd(), "crawlering_basic", "baidu_tech_pic")
os.makedirs(save_dir, exist_ok=True)

num = 1
for j in data:
    try:
        res_pic = requests.get(url=j, headers=headers_pic, timeout=10)
        res_pic.raise_for_status()
    except requests.RequestException as err:
        # The image host may refuse, time out, or return an error status;
        # skip this image and carry on with the remaining links.
        print(f"skipping image {num} ({j}): {err}")
    else:
        with open(os.path.join(save_dir, f"{num}.jpg"), "wb") as f0:
            f0.write(res_pic.content)

    # Advance even after a failure so file numbers keep matching link order.
    num += 1



以上是关于Python爬虫四的主要内容,如果未能解决你的问题,请参考以下文章

Python爬虫利器四之PhantomJS的用法

Python爬虫利器四之PhantomJS的用法

Python爬虫学习:四headers和data的获取

Python爬虫实战四之抓取淘宝MM照片

Python爬虫实战四之抓取淘宝MM照片

转 Python爬虫入门四之Urllib库的高级用法