Python crawler: scraping Baidu Music

Posted by 编程人生改变命运

Lead-in: this article, compiled by the editors of 小常识网 (cha138.com), walks through scraping Baidu Music with a Python crawler; hopefully it is of some reference value to you.

#coding=utf-8

import requests
import re
import time
from bs4 import BeautifulSoup

def spider():
    # Reuse one session so the connection (and any cookies) persists across pages
    session = requests.Session()
    # Pages 100-150 of the "new songs" tag listing, 20 songs per page
    for i in range(100, 151):
        start = i * 20
        url = 'http://music.baidu.com/tag/%E6%96%B0%E6%AD%8C?size=20&start=' + str(start) + '&third_type=0'
        print(url)
        headers = {

            "Host":"music.baidu.com",
            "Connection":"keep-alive",
            "Cache-Control":"max-age=0",
            "Upgrade-Insecure-Requests":"1",
            "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
            "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Referer":url,
            "Accept-Encoding":"gzip, deflate",
            "Accept-Language":"zh-CN,zh;q=0.9",
            # "Cookie":"checkStatus=true; BIDUPSID=F76081B6DCEF178EB115E76CFFABDFFF; PSTM=1490192233; __cfduid=dc0607f001fdddad698f98a17b619d9461517674324; BAIDUID=FCBB590CDE88FE3F4965949AD0A91252:FG=1; MCITY=-%3A; BDUSS=FXUDdYdmVacmV3cC1nNXhnM2RlRi1UWEw3dTFuUzdjSHFvTXZaTlpmdGktUnRiQVFBQUFBJCQAAAAAAAAAAAEAAACeLk0x0O20usHWMTY4AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGJs9FpibPRaQl; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=13290_1434_21114_20883_20929; PSINO=2; BCLID=13234662273182259149; BDSFRCVID=LeIsJeC6246SbPQAU-w6KwKAG0BRyj7TH6-JNTcy6f-W_zkxmhlfEG0PqU8g0Ku-jgO9ogKK0mOTHvjP; H_BDCLCKID_SF=tJkt_K-aJKvjD4-k247Hhn8thmT22-usBITAQhcH0KLKMKQb-l3GLqQD5Nji-MnC3bRGbtT_JMb1M66_XlOj2jKEqJJdhtnOaCbQ0q5TtUJaSDnTDMRhqtIsXNryKMnitIj9-pnK2ft0HPonHjKhejv-3f; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BAIDU_DUP_lcr=https://www.duba.com/?f=qd_sch; userid=827141790; app_vip=show; Hm_lvt_d0ad46e4afeacf34cd12de4c9b553aa6=1526222318; u_id=; u_t=; UM_distinctid=16359f080b3a3-0802715d516d47-454c092b-ff000-16359f080b450a; CNZZDATA1262632547=1637929121-1526217035-http%253A%252F%252Fmusic.baidu.com%252F%7C1526217035; u_lo=0; checkStatus=true; tracesrc=-1%7C%7C-1; Hm_lpvt_d0ad46e4afeacf34cd12de4c9b553aa6=1526222739",




        }

        result = session.get(url=url, headers=headers)

        if result.status_code == 200:
            result_html = result.content
            # print(result_html)

            soup = BeautifulSoup(result_html, 'html.parser')
            # Each song in the listing is rendered as <div class="song-item clearfix ">
            result_divs = soup.find_all('div', attrs={"class": "song-item clearfix "})
            print(len(result_divs))
            for result_div in result_divs:
                # Flatten the row's markup onto one line so the regexes below can match across it
                result_replace = str(result_div).replace('\r\n\t', '<br/>').replace('\n\t', '<br/>').replace('\n', '<br/>')
                print(result_replace)
                index_num = re.findall('<span class="index-num index-hook" style="width: 25px;">(.*?)</span><span class="song-info', result_replace)[0]
                song_url_name = re.findall('href="(.*?)" target="_blank" title="(.*?)</a><div class="extra-info">', result_replace)[0]
                song_url = song_url_name[0]
                song_name = song_url_name[1]

                # Some rows carry an extra note in <span class="appendix">
                if '<span class="appendix">' in result_replace:
                    try:
                        appendix = re.findall('<div class="extra-info"><span class="appendix">(.*?)</span></div>', str(result_replace))[0]
                    except IndexError:
                        appendix = re.findall('<span class="appendix">(.*?)</span>', str(result_replace))[0]
                else:
                    appendix = ""

                author_list = re.findall('<span class="author_list" title="(.*?)">', result_replace)[0]
                if '<a hidefocus="true" href=' in result_replace:
                    author_url = re.findall('<a hidefocus="true" href="(.*?)" target="_blank">', result_replace)[0]
                    author_url = "http://music.baidu.com/" + author_url
                else:
                    author_url = ""

                song_url = "http://music.baidu.com/" + song_url

                # print(author_url)
                # print(song_url)
                print(author_list)
                # print(appendix)
                # print(index_num)
                # print(song_url)
                print(song_name)

                print("=" * 88)
            # time.sleep(2)  # optional: pause between pages to go easy on the server


spider()
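
As a design note: flattening each row to a string and picking fields out with regular expressions is brittle; the same data can usually be read straight off the parsed tree with BeautifulSoup. Below is a minimal sketch of that approach. The class names ("song-item clearfix ", "index-num", "author_list", "appendix") and the link layout are assumptions carried over from the regexes above, and music.baidu.com's markup may well have changed since this was written, so treat it as an illustration rather than a drop-in replacement.

def parse_song_rows(html):
    """Pull index, song name, song URL, artists and appendix out of one listing page."""
    soup = BeautifulSoup(html, 'html.parser')
    rows = []
    for div in soup.find_all('div', attrs={"class": "song-item clearfix "}):
        index_span = div.find('span', class_='index-num')
        title_link = div.find('a', href=True, target='_blank')   # assumed: first such link is the song link
        author_span = div.find('span', class_='author_list')
        appendix_span = div.find('span', class_='appendix')
        rows.append({
            'index': index_span.get_text(strip=True) if index_span else '',
            'song_name': title_link.get_text(strip=True) if title_link else '',
            'song_url': 'http://music.baidu.com' + title_link['href'] if title_link else '',
            'authors': author_span.get('title', '') if author_span else '',
            'appendix': appendix_span.get_text(strip=True) if appendix_span else '',
        })
    return rows

Inside spider(), the regex block could then be replaced by a call such as parse_song_rows(result.content), iterating over the returned dictionaries instead of result_divs.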

 

The above covers the main points of scraping Baidu Music with a Python crawler. If it did not solve your problem, the following articles may help:

Python crawler: BeautifulSoup4

Python crawler - scraping the first 200 lines of Baidu's HTML

How to scrape NetEase Cloud Music songs with a Python web crawler

A QQ Music crawler built with Python Scrapy: downloading music and scraping song info, lyrics and top comments

Java crawler series, hands-on: scraping the TOP500 songs from KuGou Music

Python crawler practice -- scraping NetEase Cloud Music